1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS An implementation of the IP virtual server support for the 4 * LINUX operating system. IPVS is now implemented as a module 5 * over the NetFilter framework. IPVS can be used to build a 6 * high-performance and highly available server based on a 7 * cluster of servers. 8 * 9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 10 * Peter Kese <peter.kese@ijs.si> 11 * Julian Anastasov <ja@ssi.bg> 12 * 13 * Changes: 14 */ 15 16 #define KMSG_COMPONENT "IPVS" 17 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 18 19 #include <linux/module.h> 20 #include <linux/init.h> 21 #include <linux/types.h> 22 #include <linux/capability.h> 23 #include <linux/fs.h> 24 #include <linux/sysctl.h> 25 #include <linux/proc_fs.h> 26 #include <linux/workqueue.h> 27 #include <linux/seq_file.h> 28 #include <linux/slab.h> 29 30 #include <linux/netfilter.h> 31 #include <linux/netfilter_ipv4.h> 32 #include <linux/mutex.h> 33 34 #include <net/net_namespace.h> 35 #include <linux/nsproxy.h> 36 #include <net/ip.h> 37 #ifdef CONFIG_IP_VS_IPV6 38 #include <net/ipv6.h> 39 #include <net/ip6_route.h> 40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h> 41 #endif 42 #include <net/route.h> 43 #include <net/sock.h> 44 #include <net/genetlink.h> 45 46 #include <linux/uaccess.h> 47 48 #include <net/ip_vs.h> 49 50 MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); 51 52 DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ 53 54 /* sysctl variables */ 55 56 #ifdef CONFIG_IP_VS_DEBUG 57 static int sysctl_ip_vs_debug_level = 0; 58 59 int ip_vs_get_debug_level(void) 60 { 61 return sysctl_ip_vs_debug_level; 62 } 63 #endif 64 65 66 /* Protos */ 67 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); 68 69 70 #ifdef CONFIG_IP_VS_IPV6 71 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
*/ 72 static bool __ip_vs_addr_is_local_v6(struct net *net, 73 const struct in6_addr *addr) 74 { 75 struct flowi6 fl6 = { 76 .daddr = *addr, 77 }; 78 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); 79 bool is_local; 80 81 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); 82 83 dst_release(dst); 84 return is_local; 85 } 86 #endif 87 88 #ifdef CONFIG_SYSCTL 89 /* 90 * update_defense_level is called from keventd and from sysctl, 91 * so it needs to protect itself from softirqs 92 */ 93 static void update_defense_level(struct netns_ipvs *ipvs) 94 { 95 struct sysinfo i; 96 int availmem; 97 int nomem; 98 int to_change = -1; 99 100 /* we only count free and buffered memory (in pages) */ 101 si_meminfo(&i); 102 availmem = i.freeram + i.bufferram; 103 /* however in linux 2.5 the i.bufferram is total page cache size, 104 we need adjust it */ 105 /* si_swapinfo(&i); */ 106 /* availmem = availmem - (i.totalswap - i.freeswap); */ 107 108 nomem = (availmem < ipvs->sysctl_amemthresh); 109 110 local_bh_disable(); 111 112 /* drop_entry */ 113 spin_lock(&ipvs->dropentry_lock); 114 switch (ipvs->sysctl_drop_entry) { 115 case 0: 116 atomic_set(&ipvs->dropentry, 0); 117 break; 118 case 1: 119 if (nomem) { 120 atomic_set(&ipvs->dropentry, 1); 121 ipvs->sysctl_drop_entry = 2; 122 } else { 123 atomic_set(&ipvs->dropentry, 0); 124 } 125 break; 126 case 2: 127 if (nomem) { 128 atomic_set(&ipvs->dropentry, 1); 129 } else { 130 atomic_set(&ipvs->dropentry, 0); 131 ipvs->sysctl_drop_entry = 1; 132 } 133 break; 134 case 3: 135 atomic_set(&ipvs->dropentry, 1); 136 break; 137 } 138 spin_unlock(&ipvs->dropentry_lock); 139 140 /* drop_packet */ 141 spin_lock(&ipvs->droppacket_lock); 142 switch (ipvs->sysctl_drop_packet) { 143 case 0: 144 ipvs->drop_rate = 0; 145 break; 146 case 1: 147 if (nomem) { 148 ipvs->drop_rate = ipvs->drop_counter 149 = ipvs->sysctl_amemthresh / 150 (ipvs->sysctl_amemthresh-availmem); 151 ipvs->sysctl_drop_packet = 2; 152 } else { 153 
ipvs->drop_rate = 0; 154 } 155 break; 156 case 2: 157 if (nomem) { 158 ipvs->drop_rate = ipvs->drop_counter 159 = ipvs->sysctl_amemthresh / 160 (ipvs->sysctl_amemthresh-availmem); 161 } else { 162 ipvs->drop_rate = 0; 163 ipvs->sysctl_drop_packet = 1; 164 } 165 break; 166 case 3: 167 ipvs->drop_rate = ipvs->sysctl_am_droprate; 168 break; 169 } 170 spin_unlock(&ipvs->droppacket_lock); 171 172 /* secure_tcp */ 173 spin_lock(&ipvs->securetcp_lock); 174 switch (ipvs->sysctl_secure_tcp) { 175 case 0: 176 if (ipvs->old_secure_tcp >= 2) 177 to_change = 0; 178 break; 179 case 1: 180 if (nomem) { 181 if (ipvs->old_secure_tcp < 2) 182 to_change = 1; 183 ipvs->sysctl_secure_tcp = 2; 184 } else { 185 if (ipvs->old_secure_tcp >= 2) 186 to_change = 0; 187 } 188 break; 189 case 2: 190 if (nomem) { 191 if (ipvs->old_secure_tcp < 2) 192 to_change = 1; 193 } else { 194 if (ipvs->old_secure_tcp >= 2) 195 to_change = 0; 196 ipvs->sysctl_secure_tcp = 1; 197 } 198 break; 199 case 3: 200 if (ipvs->old_secure_tcp < 2) 201 to_change = 1; 202 break; 203 } 204 ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; 205 if (to_change >= 0) 206 ip_vs_protocol_timeout_change(ipvs, 207 ipvs->sysctl_secure_tcp > 1); 208 spin_unlock(&ipvs->securetcp_lock); 209 210 local_bh_enable(); 211 } 212 213 /* Handler for delayed work for expiring no 214 * destination connections 215 */ 216 static void expire_nodest_conn_handler(struct work_struct *work) 217 { 218 struct netns_ipvs *ipvs; 219 220 ipvs = container_of(work, struct netns_ipvs, 221 expire_nodest_conn_work.work); 222 ip_vs_expire_nodest_conn_flush(ipvs); 223 } 224 225 /* 226 * Timer for checking the defense 227 */ 228 #define DEFENSE_TIMER_PERIOD 1*HZ 229 230 static void defense_work_handler(struct work_struct *work) 231 { 232 struct netns_ipvs *ipvs = 233 container_of(work, struct netns_ipvs, defense_work.work); 234 235 update_defense_level(ipvs); 236 if (atomic_read(&ipvs->dropentry)) 237 ip_vs_random_dropentry(ipvs); 238 
queue_delayed_work(system_long_wq, &ipvs->defense_work, 239 DEFENSE_TIMER_PERIOD); 240 } 241 #endif 242 243 static void est_reload_work_handler(struct work_struct *work) 244 { 245 struct netns_ipvs *ipvs = 246 container_of(work, struct netns_ipvs, est_reload_work.work); 247 int genid_done = atomic_read(&ipvs->est_genid_done); 248 unsigned long delay = HZ / 10; /* repeat startups after failure */ 249 bool repeat = false; 250 int genid; 251 int id; 252 253 mutex_lock(&ipvs->est_mutex); 254 genid = atomic_read(&ipvs->est_genid); 255 for (id = 0; id < ipvs->est_kt_count; id++) { 256 struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; 257 258 /* netns clean up started, abort delayed work */ 259 if (!ipvs->enable) 260 goto unlock; 261 if (!kd) 262 continue; 263 /* New config ? Stop kthread tasks */ 264 if (genid != genid_done) 265 ip_vs_est_kthread_stop(kd); 266 if (!kd->task && !ip_vs_est_stopped(ipvs)) { 267 /* Do not start kthreads above 0 in calc phase */ 268 if ((!id || !ipvs->est_calc_phase) && 269 ip_vs_est_kthread_start(ipvs, kd) < 0) 270 repeat = true; 271 } 272 } 273 274 atomic_set(&ipvs->est_genid_done, genid); 275 276 if (repeat) 277 queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 278 delay); 279 280 unlock: 281 mutex_unlock(&ipvs->est_mutex); 282 } 283 284 int 285 ip_vs_use_count_inc(void) 286 { 287 return try_module_get(THIS_MODULE); 288 } 289 290 void 291 ip_vs_use_count_dec(void) 292 { 293 module_put(THIS_MODULE); 294 } 295 296 297 /* 298 * Hash table: for virtual service lookups 299 */ 300 #define IP_VS_SVC_TAB_BITS 8 301 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) 302 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) 303 304 /* the service table hashed by <protocol, addr, port> */ 305 static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; 306 /* the service table hashed by fwmark */ 307 static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; 308 309 310 /* 311 * Returns hash value for virtual service 312 */ 313 
static inline unsigned int 314 ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, 315 const union nf_inet_addr *addr, __be16 port) 316 { 317 unsigned int porth = ntohs(port); 318 __be32 addr_fold = addr->ip; 319 __u32 ahash; 320 321 #ifdef CONFIG_IP_VS_IPV6 322 if (af == AF_INET6) 323 addr_fold = addr->ip6[0]^addr->ip6[1]^ 324 addr->ip6[2]^addr->ip6[3]; 325 #endif 326 ahash = ntohl(addr_fold); 327 ahash ^= ((size_t) ipvs >> 8); 328 329 return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & 330 IP_VS_SVC_TAB_MASK; 331 } 332 333 /* 334 * Returns hash value of fwmark for virtual service lookup 335 */ 336 static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) 337 { 338 return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; 339 } 340 341 /* 342 * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> 343 * or in the ip_vs_svc_fwm_table by fwmark. 344 * Should be called with locked tables. 345 */ 346 static int ip_vs_svc_hash(struct ip_vs_service *svc) 347 { 348 unsigned int hash; 349 350 if (svc->flags & IP_VS_SVC_F_HASHED) { 351 pr_err("%s(): request for already hashed, called from %pS\n", 352 __func__, __builtin_return_address(0)); 353 return 0; 354 } 355 356 if (svc->fwmark == 0) { 357 /* 358 * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table 359 */ 360 hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, 361 &svc->addr, svc->port); 362 hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); 363 } else { 364 /* 365 * Hash it by fwmark in svc_fwm_table 366 */ 367 hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); 368 hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); 369 } 370 371 svc->flags |= IP_VS_SVC_F_HASHED; 372 /* increase its refcnt because it is referenced by the svc table */ 373 atomic_inc(&svc->refcnt); 374 return 1; 375 } 376 377 378 /* 379 * Unhashes a service from svc_table / svc_fwm_table. 380 * Should be called with locked tables. 
381 */ 382 static int ip_vs_svc_unhash(struct ip_vs_service *svc) 383 { 384 if (!(svc->flags & IP_VS_SVC_F_HASHED)) { 385 pr_err("%s(): request for unhash flagged, called from %pS\n", 386 __func__, __builtin_return_address(0)); 387 return 0; 388 } 389 390 if (svc->fwmark == 0) { 391 /* Remove it from the svc_table table */ 392 hlist_del_rcu(&svc->s_list); 393 } else { 394 /* Remove it from the svc_fwm_table table */ 395 hlist_del_rcu(&svc->f_list); 396 } 397 398 svc->flags &= ~IP_VS_SVC_F_HASHED; 399 atomic_dec(&svc->refcnt); 400 return 1; 401 } 402 403 404 /* 405 * Get service by {netns, proto,addr,port} in the service table. 406 */ 407 static inline struct ip_vs_service * 408 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, 409 const union nf_inet_addr *vaddr, __be16 vport) 410 { 411 unsigned int hash; 412 struct ip_vs_service *svc; 413 414 /* Check for "full" addressed entries */ 415 hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); 416 417 hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { 418 if ((svc->af == af) 419 && ip_vs_addr_equal(af, &svc->addr, vaddr) 420 && (svc->port == vport) 421 && (svc->protocol == protocol) 422 && (svc->ipvs == ipvs)) { 423 /* HIT */ 424 return svc; 425 } 426 } 427 428 return NULL; 429 } 430 431 432 /* 433 * Get service by {fwmark} in the service table. 
434 */ 435 static inline struct ip_vs_service * 436 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) 437 { 438 unsigned int hash; 439 struct ip_vs_service *svc; 440 441 /* Check for fwmark addressed entries */ 442 hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); 443 444 hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { 445 if (svc->fwmark == fwmark && svc->af == af 446 && (svc->ipvs == ipvs)) { 447 /* HIT */ 448 return svc; 449 } 450 } 451 452 return NULL; 453 } 454 455 /* Find service, called under RCU lock */ 456 struct ip_vs_service * 457 ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, 458 const union nf_inet_addr *vaddr, __be16 vport) 459 { 460 struct ip_vs_service *svc; 461 462 /* 463 * Check the table hashed by fwmark first 464 */ 465 if (fwmark) { 466 svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); 467 if (svc) 468 goto out; 469 } 470 471 /* 472 * Check the table hashed by <protocol,addr,port> 473 * for "full" addressed entries 474 */ 475 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); 476 477 if (!svc && protocol == IPPROTO_TCP && 478 atomic_read(&ipvs->ftpsvc_counter) && 479 (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { 480 /* 481 * Check if ftp service entry exists, the packet 482 * might belong to FTP data connections. 483 */ 484 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); 485 } 486 487 if (svc == NULL 488 && atomic_read(&ipvs->nullsvc_counter)) { 489 /* 490 * Check if the catch-all port (port zero) exists 491 */ 492 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); 493 } 494 495 out: 496 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", 497 fwmark, ip_vs_proto_name(protocol), 498 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), 499 svc ? 
"hit" : "not hit"); 500 501 return svc; 502 } 503 504 505 static inline void 506 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) 507 { 508 atomic_inc(&svc->refcnt); 509 rcu_assign_pointer(dest->svc, svc); 510 } 511 512 static void ip_vs_service_free(struct ip_vs_service *svc) 513 { 514 ip_vs_stats_release(&svc->stats); 515 kfree(svc); 516 } 517 518 static void ip_vs_service_rcu_free(struct rcu_head *head) 519 { 520 struct ip_vs_service *svc; 521 522 svc = container_of(head, struct ip_vs_service, rcu_head); 523 ip_vs_service_free(svc); 524 } 525 526 static void __ip_vs_svc_put(struct ip_vs_service *svc) 527 { 528 if (atomic_dec_and_test(&svc->refcnt)) { 529 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", 530 svc->fwmark, 531 IP_VS_DBG_ADDR(svc->af, &svc->addr), 532 ntohs(svc->port)); 533 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); 534 } 535 } 536 537 538 /* 539 * Returns hash value for real service 540 */ 541 static inline unsigned int ip_vs_rs_hashkey(int af, 542 const union nf_inet_addr *addr, 543 __be16 port) 544 { 545 unsigned int porth = ntohs(port); 546 __be32 addr_fold = addr->ip; 547 548 #ifdef CONFIG_IP_VS_IPV6 549 if (af == AF_INET6) 550 addr_fold = addr->ip6[0]^addr->ip6[1]^ 551 addr->ip6[2]^addr->ip6[3]; 552 #endif 553 554 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) 555 & IP_VS_RTAB_MASK; 556 } 557 558 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. 
*/ 559 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) 560 { 561 unsigned int hash; 562 __be16 port; 563 564 if (dest->in_rs_table) 565 return; 566 567 switch (IP_VS_DFWD_METHOD(dest)) { 568 case IP_VS_CONN_F_MASQ: 569 port = dest->port; 570 break; 571 case IP_VS_CONN_F_TUNNEL: 572 switch (dest->tun_type) { 573 case IP_VS_CONN_F_TUNNEL_TYPE_GUE: 574 port = dest->tun_port; 575 break; 576 case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: 577 case IP_VS_CONN_F_TUNNEL_TYPE_GRE: 578 port = 0; 579 break; 580 default: 581 return; 582 } 583 break; 584 default: 585 return; 586 } 587 588 /* 589 * Hash by proto,addr,port, 590 * which are the parameters of the real service. 591 */ 592 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); 593 594 hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); 595 dest->in_rs_table = 1; 596 } 597 598 /* Unhash ip_vs_dest from rs_table. */ 599 static void ip_vs_rs_unhash(struct ip_vs_dest *dest) 600 { 601 /* 602 * Remove it from the rs_table table. 603 */ 604 if (dest->in_rs_table) { 605 hlist_del_rcu(&dest->d_list); 606 dest->in_rs_table = 0; 607 } 608 } 609 610 /* Check if real service by <proto,addr,port> is present */ 611 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, 612 const union nf_inet_addr *daddr, __be16 dport) 613 { 614 unsigned int hash; 615 struct ip_vs_dest *dest; 616 617 /* Check for "full" addressed entries */ 618 hash = ip_vs_rs_hashkey(af, daddr, dport); 619 620 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 621 if (dest->port == dport && 622 dest->af == af && 623 ip_vs_addr_equal(af, &dest->addr, daddr) && 624 (dest->protocol == protocol || dest->vfwmark) && 625 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { 626 /* HIT */ 627 return true; 628 } 629 } 630 631 return false; 632 } 633 634 /* Find real service record by <proto,addr,port>. 635 * In case of multiple records with the same <proto,addr,port>, only 636 * the first found record is returned. 
637 * 638 * To be called under RCU lock. 639 */ 640 struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, 641 __u16 protocol, 642 const union nf_inet_addr *daddr, 643 __be16 dport) 644 { 645 unsigned int hash; 646 struct ip_vs_dest *dest; 647 648 /* Check for "full" addressed entries */ 649 hash = ip_vs_rs_hashkey(af, daddr, dport); 650 651 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 652 if (dest->port == dport && 653 dest->af == af && 654 ip_vs_addr_equal(af, &dest->addr, daddr) && 655 (dest->protocol == protocol || dest->vfwmark) && 656 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { 657 /* HIT */ 658 return dest; 659 } 660 } 661 662 return NULL; 663 } 664 665 /* Find real service record by <af,addr,tun_port>. 666 * In case of multiple records with the same <af,addr,tun_port>, only 667 * the first found record is returned. 668 * 669 * To be called under RCU lock. 670 */ 671 struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, 672 const union nf_inet_addr *daddr, 673 __be16 tun_port) 674 { 675 struct ip_vs_dest *dest; 676 unsigned int hash; 677 678 /* Check for "full" addressed entries */ 679 hash = ip_vs_rs_hashkey(af, daddr, tun_port); 680 681 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { 682 if (dest->tun_port == tun_port && 683 dest->af == af && 684 ip_vs_addr_equal(af, &dest->addr, daddr) && 685 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { 686 /* HIT */ 687 return dest; 688 } 689 } 690 691 return NULL; 692 } 693 694 /* Lookup destination by {addr,port} in the given service 695 * Called under RCU lock. 
696 */ 697 static struct ip_vs_dest * 698 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, 699 const union nf_inet_addr *daddr, __be16 dport) 700 { 701 struct ip_vs_dest *dest; 702 703 /* 704 * Find the destination for the given service 705 */ 706 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 707 if ((dest->af == dest_af) && 708 ip_vs_addr_equal(dest_af, &dest->addr, daddr) && 709 (dest->port == dport)) { 710 /* HIT */ 711 return dest; 712 } 713 } 714 715 return NULL; 716 } 717 718 /* 719 * Find destination by {daddr,dport,vaddr,protocol} 720 * Created to be used in ip_vs_process_message() in 721 * the backup synchronization daemon. It finds the 722 * destination to be bound to the received connection 723 * on the backup. 724 * Called under RCU lock, no refcnt is returned. 725 */ 726 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, 727 const union nf_inet_addr *daddr, 728 __be16 dport, 729 const union nf_inet_addr *vaddr, 730 __be16 vport, __u16 protocol, __u32 fwmark, 731 __u32 flags) 732 { 733 struct ip_vs_dest *dest; 734 struct ip_vs_service *svc; 735 __be16 port = dport; 736 737 svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); 738 if (!svc) 739 return NULL; 740 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) 741 port = 0; 742 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); 743 if (!dest) 744 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); 745 return dest; 746 } 747 748 void ip_vs_dest_dst_rcu_free(struct rcu_head *head) 749 { 750 struct ip_vs_dest_dst *dest_dst = container_of(head, 751 struct ip_vs_dest_dst, 752 rcu_head); 753 754 dst_release(dest_dst->dst_cache); 755 kfree(dest_dst); 756 } 757 758 /* Release dest_dst and dst_cache for dest in user context */ 759 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) 760 { 761 struct ip_vs_dest_dst *old; 762 763 old = rcu_dereference_protected(dest->dest_dst, 1); 764 if (old) { 765 
RCU_INIT_POINTER(dest->dest_dst, NULL); 766 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); 767 } 768 } 769 770 /* 771 * Lookup dest by {svc,addr,port} in the destination trash. 772 * The destination trash is used to hold the destinations that are removed 773 * from the service table but are still referenced by some conn entries. 774 * The reason to add the destination trash is when the dest is temporary 775 * down (either by administrator or by monitor program), the dest can be 776 * picked back from the trash, the remaining connections to the dest can 777 * continue, and the counting information of the dest is also useful for 778 * scheduling. 779 */ 780 static struct ip_vs_dest * 781 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, 782 const union nf_inet_addr *daddr, __be16 dport) 783 { 784 struct ip_vs_dest *dest; 785 struct netns_ipvs *ipvs = svc->ipvs; 786 787 /* 788 * Find the destination in trash 789 */ 790 spin_lock_bh(&ipvs->dest_trash_lock); 791 list_for_each_entry(dest, &ipvs->dest_trash, t_list) { 792 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " 793 "dest->refcnt=%d\n", 794 dest->vfwmark, 795 IP_VS_DBG_ADDR(dest->af, &dest->addr), 796 ntohs(dest->port), 797 refcount_read(&dest->refcnt)); 798 if (dest->af == dest_af && 799 ip_vs_addr_equal(dest_af, &dest->addr, daddr) && 800 dest->port == dport && 801 dest->vfwmark == svc->fwmark && 802 dest->protocol == svc->protocol && 803 (svc->fwmark || 804 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && 805 dest->vport == svc->port))) { 806 /* HIT */ 807 list_del(&dest->t_list); 808 goto out; 809 } 810 } 811 812 dest = NULL; 813 814 out: 815 spin_unlock_bh(&ipvs->dest_trash_lock); 816 817 return dest; 818 } 819 820 static void ip_vs_dest_rcu_free(struct rcu_head *head) 821 { 822 struct ip_vs_dest *dest; 823 824 dest = container_of(head, struct ip_vs_dest, rcu_head); 825 ip_vs_stats_release(&dest->stats); 826 ip_vs_dest_put_and_free(dest); 827 } 828 829 static void 
ip_vs_dest_free(struct ip_vs_dest *dest) 830 { 831 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); 832 833 __ip_vs_dst_cache_reset(dest); 834 __ip_vs_svc_put(svc); 835 call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); 836 } 837 838 /* 839 * Clean up all the destinations in the trash 840 * Called by the ip_vs_control_cleanup() 841 * 842 * When the ip_vs_control_clearup is activated by ipvs module exit, 843 * the service tables must have been flushed and all the connections 844 * are expired, and the refcnt of each destination in the trash must 845 * be 1, so we simply release them here. 846 */ 847 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) 848 { 849 struct ip_vs_dest *dest, *nxt; 850 851 del_timer_sync(&ipvs->dest_trash_timer); 852 /* No need to use dest_trash_lock */ 853 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { 854 list_del(&dest->t_list); 855 ip_vs_dest_free(dest); 856 } 857 } 858 859 static void ip_vs_stats_rcu_free(struct rcu_head *head) 860 { 861 struct ip_vs_stats_rcu *rs = container_of(head, 862 struct ip_vs_stats_rcu, 863 rcu_head); 864 865 ip_vs_stats_release(&rs->s); 866 kfree(rs); 867 } 868 869 static void 870 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) 871 { 872 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c 873 874 spin_lock(&src->lock); 875 876 IP_VS_SHOW_STATS_COUNTER(conns); 877 IP_VS_SHOW_STATS_COUNTER(inpkts); 878 IP_VS_SHOW_STATS_COUNTER(outpkts); 879 IP_VS_SHOW_STATS_COUNTER(inbytes); 880 IP_VS_SHOW_STATS_COUNTER(outbytes); 881 882 ip_vs_read_estimator(dst, src); 883 884 spin_unlock(&src->lock); 885 } 886 887 static void 888 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) 889 { 890 dst->conns = (u32)src->conns; 891 dst->inpkts = (u32)src->inpkts; 892 dst->outpkts = (u32)src->outpkts; 893 dst->inbytes = src->inbytes; 894 dst->outbytes = src->outbytes; 895 dst->cps = (u32)src->cps; 896 dst->inpps = 
(u32)src->inpps; 897 dst->outpps = (u32)src->outpps; 898 dst->inbps = (u32)src->inbps; 899 dst->outbps = (u32)src->outbps; 900 } 901 902 static void 903 ip_vs_zero_stats(struct ip_vs_stats *stats) 904 { 905 spin_lock(&stats->lock); 906 907 /* get current counters as zero point, rates are zeroed */ 908 909 #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c 910 911 IP_VS_ZERO_STATS_COUNTER(conns); 912 IP_VS_ZERO_STATS_COUNTER(inpkts); 913 IP_VS_ZERO_STATS_COUNTER(outpkts); 914 IP_VS_ZERO_STATS_COUNTER(inbytes); 915 IP_VS_ZERO_STATS_COUNTER(outbytes); 916 917 ip_vs_zero_estimator(stats); 918 919 spin_unlock(&stats->lock); 920 } 921 922 /* Allocate fields after kzalloc */ 923 int ip_vs_stats_init_alloc(struct ip_vs_stats *s) 924 { 925 int i; 926 927 spin_lock_init(&s->lock); 928 s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); 929 if (!s->cpustats) 930 return -ENOMEM; 931 932 for_each_possible_cpu(i) { 933 struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); 934 935 u64_stats_init(&cs->syncp); 936 } 937 return 0; 938 } 939 940 struct ip_vs_stats *ip_vs_stats_alloc(void) 941 { 942 struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); 943 944 if (s && ip_vs_stats_init_alloc(s) >= 0) 945 return s; 946 kfree(s); 947 return NULL; 948 } 949 950 void ip_vs_stats_release(struct ip_vs_stats *stats) 951 { 952 free_percpu(stats->cpustats); 953 } 954 955 void ip_vs_stats_free(struct ip_vs_stats *stats) 956 { 957 if (stats) { 958 ip_vs_stats_release(stats); 959 kfree(stats); 960 } 961 } 962 963 /* 964 * Update a destination in the given service 965 */ 966 static void 967 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, 968 struct ip_vs_dest_user_kern *udest, int add) 969 { 970 struct netns_ipvs *ipvs = svc->ipvs; 971 struct ip_vs_service *old_svc; 972 struct ip_vs_scheduler *sched; 973 int conn_flags; 974 975 /* We cannot modify an address and change the address family */ 976 BUG_ON(!add && udest->af != dest->af); 977 978 if 
(add && udest->af != svc->af) 979 ipvs->mixed_address_family_dests++; 980 981 /* keep the last_weight with latest non-0 weight */ 982 if (add || udest->weight != 0) 983 atomic_set(&dest->last_weight, udest->weight); 984 985 /* set the weight and the flags */ 986 atomic_set(&dest->weight, udest->weight); 987 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; 988 conn_flags |= IP_VS_CONN_F_INACTIVE; 989 990 /* Need to rehash? */ 991 if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != 992 IP_VS_DFWD_METHOD(dest) || 993 udest->tun_type != dest->tun_type || 994 udest->tun_port != dest->tun_port) 995 ip_vs_rs_unhash(dest); 996 997 /* set the tunnel info */ 998 dest->tun_type = udest->tun_type; 999 dest->tun_port = udest->tun_port; 1000 dest->tun_flags = udest->tun_flags; 1001 1002 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 1003 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { 1004 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 1005 } else { 1006 /* FTP-NAT requires conntrack for mangling */ 1007 if (svc->port == FTPPORT) 1008 ip_vs_register_conntrack(svc); 1009 } 1010 atomic_set(&dest->conn_flags, conn_flags); 1011 /* Put the real service in rs_table if not present. 
*/ 1012 ip_vs_rs_hash(ipvs, dest); 1013 1014 /* bind the service */ 1015 old_svc = rcu_dereference_protected(dest->svc, 1); 1016 if (!old_svc) { 1017 __ip_vs_bind_svc(dest, svc); 1018 } else { 1019 if (old_svc != svc) { 1020 ip_vs_zero_stats(&dest->stats); 1021 __ip_vs_bind_svc(dest, svc); 1022 __ip_vs_svc_put(old_svc); 1023 } 1024 } 1025 1026 /* set the dest status flags */ 1027 dest->flags |= IP_VS_DEST_F_AVAILABLE; 1028 1029 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) 1030 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 1031 dest->u_threshold = udest->u_threshold; 1032 dest->l_threshold = udest->l_threshold; 1033 1034 dest->af = udest->af; 1035 1036 spin_lock_bh(&dest->dst_lock); 1037 __ip_vs_dst_cache_reset(dest); 1038 spin_unlock_bh(&dest->dst_lock); 1039 1040 if (add) { 1041 list_add_rcu(&dest->n_list, &svc->destinations); 1042 svc->num_dests++; 1043 sched = rcu_dereference_protected(svc->scheduler, 1); 1044 if (sched && sched->add_dest) 1045 sched->add_dest(svc, dest); 1046 } else { 1047 sched = rcu_dereference_protected(svc->scheduler, 1); 1048 if (sched && sched->upd_dest) 1049 sched->upd_dest(svc, dest); 1050 } 1051 } 1052 1053 1054 /* 1055 * Create a destination for the given service 1056 */ 1057 static int 1058 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1059 { 1060 struct ip_vs_dest *dest; 1061 unsigned int atype; 1062 int ret; 1063 1064 EnterFunction(2); 1065 1066 #ifdef CONFIG_IP_VS_IPV6 1067 if (udest->af == AF_INET6) { 1068 atype = ipv6_addr_type(&udest->addr.in6); 1069 if ((!(atype & IPV6_ADDR_UNICAST) || 1070 atype & IPV6_ADDR_LINKLOCAL) && 1071 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) 1072 return -EINVAL; 1073 1074 ret = nf_defrag_ipv6_enable(svc->ipvs->net); 1075 if (ret) 1076 return ret; 1077 } else 1078 #endif 1079 { 1080 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); 1081 if (atype != RTN_LOCAL && atype != RTN_UNICAST) 1082 return -EINVAL; 1083 } 1084 1085 dest 
= kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); 1086 if (dest == NULL) 1087 return -ENOMEM; 1088 1089 ret = ip_vs_stats_init_alloc(&dest->stats); 1090 if (ret < 0) 1091 goto err_alloc; 1092 1093 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1094 if (ret < 0) 1095 goto err_stats; 1096 1097 dest->af = udest->af; 1098 dest->protocol = svc->protocol; 1099 dest->vaddr = svc->addr; 1100 dest->vport = svc->port; 1101 dest->vfwmark = svc->fwmark; 1102 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); 1103 dest->port = udest->port; 1104 1105 atomic_set(&dest->activeconns, 0); 1106 atomic_set(&dest->inactconns, 0); 1107 atomic_set(&dest->persistconns, 0); 1108 refcount_set(&dest->refcnt, 1); 1109 1110 INIT_HLIST_NODE(&dest->d_list); 1111 spin_lock_init(&dest->dst_lock); 1112 __ip_vs_update_dest(svc, dest, udest, 1); 1113 1114 LeaveFunction(2); 1115 return 0; 1116 1117 err_stats: 1118 ip_vs_stats_release(&dest->stats); 1119 1120 err_alloc: 1121 kfree(dest); 1122 return ret; 1123 } 1124 1125 1126 /* 1127 * Add a destination into an existing service 1128 */ 1129 static int 1130 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1131 { 1132 struct ip_vs_dest *dest; 1133 union nf_inet_addr daddr; 1134 __be16 dport = udest->port; 1135 int ret; 1136 1137 EnterFunction(2); 1138 1139 if (udest->weight < 0) { 1140 pr_err("%s(): server weight less than zero\n", __func__); 1141 return -ERANGE; 1142 } 1143 1144 if (udest->l_threshold > udest->u_threshold) { 1145 pr_err("%s(): lower threshold is higher than upper threshold\n", 1146 __func__); 1147 return -ERANGE; 1148 } 1149 1150 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1151 if (udest->tun_port == 0) { 1152 pr_err("%s(): tunnel port is zero\n", __func__); 1153 return -EINVAL; 1154 } 1155 } 1156 1157 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1158 1159 /* We use function that requires RCU lock */ 1160 rcu_read_lock(); 1161 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, 
dport); 1162 rcu_read_unlock(); 1163 1164 if (dest != NULL) { 1165 IP_VS_DBG(1, "%s(): dest already exists\n", __func__); 1166 return -EEXIST; 1167 } 1168 1169 /* 1170 * Check if the dest already exists in the trash and 1171 * is from the same service 1172 */ 1173 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); 1174 1175 if (dest != NULL) { 1176 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " 1177 "dest->refcnt=%d, service %u/%s:%u\n", 1178 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), 1179 refcount_read(&dest->refcnt), 1180 dest->vfwmark, 1181 IP_VS_DBG_ADDR(svc->af, &dest->vaddr), 1182 ntohs(dest->vport)); 1183 1184 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1185 if (ret < 0) 1186 goto err; 1187 __ip_vs_update_dest(svc, dest, udest, 1); 1188 } else { 1189 /* 1190 * Allocate and initialize the dest structure 1191 */ 1192 ret = ip_vs_new_dest(svc, udest); 1193 } 1194 1195 err: 1196 LeaveFunction(2); 1197 1198 return ret; 1199 } 1200 1201 1202 /* 1203 * Edit a destination in the given service 1204 */ 1205 static int 1206 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1207 { 1208 struct ip_vs_dest *dest; 1209 union nf_inet_addr daddr; 1210 __be16 dport = udest->port; 1211 1212 EnterFunction(2); 1213 1214 if (udest->weight < 0) { 1215 pr_err("%s(): server weight less than zero\n", __func__); 1216 return -ERANGE; 1217 } 1218 1219 if (udest->l_threshold > udest->u_threshold) { 1220 pr_err("%s(): lower threshold is higher than upper threshold\n", 1221 __func__); 1222 return -ERANGE; 1223 } 1224 1225 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1226 if (udest->tun_port == 0) { 1227 pr_err("%s(): tunnel port is zero\n", __func__); 1228 return -EINVAL; 1229 } 1230 } 1231 1232 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1233 1234 /* We use function that requires RCU lock */ 1235 rcu_read_lock(); 1236 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1237 rcu_read_unlock(); 1238 1239 if 
(dest == NULL) { 1240 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); 1241 return -ENOENT; 1242 } 1243 1244 __ip_vs_update_dest(svc, dest, udest, 0); 1245 LeaveFunction(2); 1246 1247 return 0; 1248 } 1249 1250 /* 1251 * Delete a destination (must be already unlinked from the service) 1252 */ 1253 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, 1254 bool cleanup) 1255 { 1256 ip_vs_stop_estimator(ipvs, &dest->stats); 1257 1258 /* 1259 * Remove it from the d-linked list with the real services. 1260 */ 1261 ip_vs_rs_unhash(dest); 1262 1263 spin_lock_bh(&ipvs->dest_trash_lock); 1264 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", 1265 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), 1266 refcount_read(&dest->refcnt)); 1267 if (list_empty(&ipvs->dest_trash) && !cleanup) 1268 mod_timer(&ipvs->dest_trash_timer, 1269 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1270 /* dest lives in trash with reference */ 1271 list_add(&dest->t_list, &ipvs->dest_trash); 1272 dest->idle_start = 0; 1273 spin_unlock_bh(&ipvs->dest_trash_lock); 1274 1275 /* Queue up delayed work to expire all no destination connections. 1276 * No-op when CONFIG_SYSCTL is disabled. 1277 */ 1278 if (!cleanup) 1279 ip_vs_enqueue_expire_nodest_conns(ipvs); 1280 } 1281 1282 1283 /* 1284 * Unlink a destination from the given service 1285 */ 1286 static void __ip_vs_unlink_dest(struct ip_vs_service *svc, 1287 struct ip_vs_dest *dest, 1288 int svcupd) 1289 { 1290 dest->flags &= ~IP_VS_DEST_F_AVAILABLE; 1291 1292 /* 1293 * Remove it from the d-linked destination list. 
1294 */ 1295 list_del_rcu(&dest->n_list); 1296 svc->num_dests--; 1297 1298 if (dest->af != svc->af) 1299 svc->ipvs->mixed_address_family_dests--; 1300 1301 if (svcupd) { 1302 struct ip_vs_scheduler *sched; 1303 1304 sched = rcu_dereference_protected(svc->scheduler, 1); 1305 if (sched && sched->del_dest) 1306 sched->del_dest(svc, dest); 1307 } 1308 } 1309 1310 1311 /* 1312 * Delete a destination server in the given service 1313 */ 1314 static int 1315 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1316 { 1317 struct ip_vs_dest *dest; 1318 __be16 dport = udest->port; 1319 1320 EnterFunction(2); 1321 1322 /* We use function that requires RCU lock */ 1323 rcu_read_lock(); 1324 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); 1325 rcu_read_unlock(); 1326 1327 if (dest == NULL) { 1328 IP_VS_DBG(1, "%s(): destination not found!\n", __func__); 1329 return -ENOENT; 1330 } 1331 1332 /* 1333 * Unlink dest from the service 1334 */ 1335 __ip_vs_unlink_dest(svc, dest, 1); 1336 1337 /* 1338 * Delete the destination 1339 */ 1340 __ip_vs_del_dest(svc->ipvs, dest, false); 1341 1342 LeaveFunction(2); 1343 1344 return 0; 1345 } 1346 1347 static void ip_vs_dest_trash_expire(struct timer_list *t) 1348 { 1349 struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); 1350 struct ip_vs_dest *dest, *next; 1351 unsigned long now = jiffies; 1352 1353 spin_lock(&ipvs->dest_trash_lock); 1354 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1355 if (refcount_read(&dest->refcnt) > 1) 1356 continue; 1357 if (dest->idle_start) { 1358 if (time_before(now, dest->idle_start + 1359 IP_VS_DEST_TRASH_PERIOD)) 1360 continue; 1361 } else { 1362 dest->idle_start = max(1UL, now); 1363 continue; 1364 } 1365 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", 1366 dest->vfwmark, 1367 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1368 ntohs(dest->port)); 1369 list_del(&dest->t_list); 1370 ip_vs_dest_free(dest); 1371 } 1372 if 
(!list_empty(&ipvs->dest_trash)) 1373 mod_timer(&ipvs->dest_trash_timer, 1374 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1375 spin_unlock(&ipvs->dest_trash_lock); 1376 } 1377 1378 /* 1379 * Add a service into the service hash table 1380 */ 1381 static int 1382 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, 1383 struct ip_vs_service **svc_p) 1384 { 1385 int ret = 0; 1386 struct ip_vs_scheduler *sched = NULL; 1387 struct ip_vs_pe *pe = NULL; 1388 struct ip_vs_service *svc = NULL; 1389 int ret_hooks = -1; 1390 1391 /* increase the module use count */ 1392 if (!ip_vs_use_count_inc()) 1393 return -ENOPROTOOPT; 1394 1395 /* Lookup the scheduler by 'u->sched_name' */ 1396 if (strcmp(u->sched_name, "none")) { 1397 sched = ip_vs_scheduler_get(u->sched_name); 1398 if (!sched) { 1399 pr_info("Scheduler module ip_vs_%s not found\n", 1400 u->sched_name); 1401 ret = -ENOENT; 1402 goto out_err; 1403 } 1404 } 1405 1406 if (u->pe_name && *u->pe_name) { 1407 pe = ip_vs_pe_getbyname(u->pe_name); 1408 if (pe == NULL) { 1409 pr_info("persistence engine module ip_vs_pe_%s " 1410 "not found\n", u->pe_name); 1411 ret = -ENOENT; 1412 goto out_err; 1413 } 1414 } 1415 1416 #ifdef CONFIG_IP_VS_IPV6 1417 if (u->af == AF_INET6) { 1418 __u32 plen = (__force __u32) u->netmask; 1419 1420 if (plen < 1 || plen > 128) { 1421 ret = -EINVAL; 1422 goto out_err; 1423 } 1424 1425 ret = nf_defrag_ipv6_enable(ipvs->net); 1426 if (ret) 1427 goto out_err; 1428 } 1429 #endif 1430 1431 if ((u->af == AF_INET && !ipvs->num_services) || 1432 (u->af == AF_INET6 && !ipvs->num_services6)) { 1433 ret = ip_vs_register_hooks(ipvs, u->af); 1434 if (ret < 0) 1435 goto out_err; 1436 ret_hooks = ret; 1437 } 1438 1439 svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); 1440 if (svc == NULL) { 1441 IP_VS_DBG(1, "%s(): no memory\n", __func__); 1442 ret = -ENOMEM; 1443 goto out_err; 1444 } 1445 ret = ip_vs_stats_init_alloc(&svc->stats); 1446 if (ret < 0) 1447 goto out_err; 1448 1449 /* I'm 
the first user of the service */ 1450 atomic_set(&svc->refcnt, 0); 1451 1452 svc->af = u->af; 1453 svc->protocol = u->protocol; 1454 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); 1455 svc->port = u->port; 1456 svc->fwmark = u->fwmark; 1457 svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; 1458 svc->timeout = u->timeout * HZ; 1459 svc->netmask = u->netmask; 1460 svc->ipvs = ipvs; 1461 1462 INIT_LIST_HEAD(&svc->destinations); 1463 spin_lock_init(&svc->sched_lock); 1464 1465 /* Bind the scheduler */ 1466 if (sched) { 1467 ret = ip_vs_bind_scheduler(svc, sched); 1468 if (ret) 1469 goto out_err; 1470 sched = NULL; 1471 } 1472 1473 ret = ip_vs_start_estimator(ipvs, &svc->stats); 1474 if (ret < 0) 1475 goto out_err; 1476 1477 /* Bind the ct retriever */ 1478 RCU_INIT_POINTER(svc->pe, pe); 1479 pe = NULL; 1480 1481 /* Update the virtual service counters */ 1482 if (svc->port == FTPPORT) 1483 atomic_inc(&ipvs->ftpsvc_counter); 1484 else if (svc->port == 0) 1485 atomic_inc(&ipvs->nullsvc_counter); 1486 if (svc->pe && svc->pe->conn_out) 1487 atomic_inc(&ipvs->conn_out_counter); 1488 1489 /* Count only IPv4 services for old get/setsockopt interface */ 1490 if (svc->af == AF_INET) 1491 ipvs->num_services++; 1492 else if (svc->af == AF_INET6) 1493 ipvs->num_services6++; 1494 1495 /* Hash the service into the service table */ 1496 ip_vs_svc_hash(svc); 1497 1498 *svc_p = svc; 1499 1500 if (!ipvs->enable) { 1501 /* Now there is a service - full throttle */ 1502 ipvs->enable = 1; 1503 1504 /* Start estimation for first time */ 1505 ip_vs_est_reload_start(ipvs); 1506 } 1507 1508 return 0; 1509 1510 1511 out_err: 1512 if (ret_hooks >= 0) 1513 ip_vs_unregister_hooks(ipvs, u->af); 1514 if (svc != NULL) { 1515 ip_vs_unbind_scheduler(svc, sched); 1516 ip_vs_service_free(svc); 1517 } 1518 ip_vs_scheduler_put(sched); 1519 ip_vs_pe_put(pe); 1520 1521 /* decrease the module use count */ 1522 ip_vs_use_count_dec(); 1523 1524 return ret; 1525 } 1526 1527 1528 /* 1529 * Edit a service and bind it 
with a new scheduler 1530 */ 1531 static int 1532 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) 1533 { 1534 struct ip_vs_scheduler *sched = NULL, *old_sched; 1535 struct ip_vs_pe *pe = NULL, *old_pe = NULL; 1536 int ret = 0; 1537 bool new_pe_conn_out, old_pe_conn_out; 1538 1539 /* 1540 * Lookup the scheduler, by 'u->sched_name' 1541 */ 1542 if (strcmp(u->sched_name, "none")) { 1543 sched = ip_vs_scheduler_get(u->sched_name); 1544 if (!sched) { 1545 pr_info("Scheduler module ip_vs_%s not found\n", 1546 u->sched_name); 1547 return -ENOENT; 1548 } 1549 } 1550 old_sched = sched; 1551 1552 if (u->pe_name && *u->pe_name) { 1553 pe = ip_vs_pe_getbyname(u->pe_name); 1554 if (pe == NULL) { 1555 pr_info("persistence engine module ip_vs_pe_%s " 1556 "not found\n", u->pe_name); 1557 ret = -ENOENT; 1558 goto out; 1559 } 1560 old_pe = pe; 1561 } 1562 1563 #ifdef CONFIG_IP_VS_IPV6 1564 if (u->af == AF_INET6) { 1565 __u32 plen = (__force __u32) u->netmask; 1566 1567 if (plen < 1 || plen > 128) { 1568 ret = -EINVAL; 1569 goto out; 1570 } 1571 } 1572 #endif 1573 1574 old_sched = rcu_dereference_protected(svc->scheduler, 1); 1575 if (sched != old_sched) { 1576 if (old_sched) { 1577 ip_vs_unbind_scheduler(svc, old_sched); 1578 RCU_INIT_POINTER(svc->scheduler, NULL); 1579 /* Wait all svc->sched_data users */ 1580 synchronize_rcu(); 1581 } 1582 /* Bind the new scheduler */ 1583 if (sched) { 1584 ret = ip_vs_bind_scheduler(svc, sched); 1585 if (ret) { 1586 ip_vs_scheduler_put(sched); 1587 goto out; 1588 } 1589 } 1590 } 1591 1592 /* 1593 * Set the flags and timeout value 1594 */ 1595 svc->flags = u->flags | IP_VS_SVC_F_HASHED; 1596 svc->timeout = u->timeout * HZ; 1597 svc->netmask = u->netmask; 1598 1599 old_pe = rcu_dereference_protected(svc->pe, 1); 1600 if (pe != old_pe) { 1601 rcu_assign_pointer(svc->pe, pe); 1602 /* check for optional methods in new pe */ 1603 new_pe_conn_out = (pe && pe->conn_out) ? 
true : false; 1604 old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; 1605 if (new_pe_conn_out && !old_pe_conn_out) 1606 atomic_inc(&svc->ipvs->conn_out_counter); 1607 if (old_pe_conn_out && !new_pe_conn_out) 1608 atomic_dec(&svc->ipvs->conn_out_counter); 1609 } 1610 1611 out: 1612 ip_vs_scheduler_put(old_sched); 1613 ip_vs_pe_put(old_pe); 1614 return ret; 1615 } 1616 1617 /* 1618 * Delete a service from the service list 1619 * - The service must be unlinked, unlocked and not referenced! 1620 * - We are called under _bh lock 1621 */ 1622 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) 1623 { 1624 struct ip_vs_dest *dest, *nxt; 1625 struct ip_vs_scheduler *old_sched; 1626 struct ip_vs_pe *old_pe; 1627 struct netns_ipvs *ipvs = svc->ipvs; 1628 1629 if (svc->af == AF_INET) { 1630 ipvs->num_services--; 1631 if (!ipvs->num_services) 1632 ip_vs_unregister_hooks(ipvs, svc->af); 1633 } else if (svc->af == AF_INET6) { 1634 ipvs->num_services6--; 1635 if (!ipvs->num_services6) 1636 ip_vs_unregister_hooks(ipvs, svc->af); 1637 } 1638 1639 ip_vs_stop_estimator(svc->ipvs, &svc->stats); 1640 1641 /* Unbind scheduler */ 1642 old_sched = rcu_dereference_protected(svc->scheduler, 1); 1643 ip_vs_unbind_scheduler(svc, old_sched); 1644 ip_vs_scheduler_put(old_sched); 1645 1646 /* Unbind persistence engine, keep svc->pe */ 1647 old_pe = rcu_dereference_protected(svc->pe, 1); 1648 if (old_pe && old_pe->conn_out) 1649 atomic_dec(&ipvs->conn_out_counter); 1650 ip_vs_pe_put(old_pe); 1651 1652 /* 1653 * Unlink the whole destination list 1654 */ 1655 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { 1656 __ip_vs_unlink_dest(svc, dest, 0); 1657 __ip_vs_del_dest(svc->ipvs, dest, cleanup); 1658 } 1659 1660 /* 1661 * Update the virtual service counters 1662 */ 1663 if (svc->port == FTPPORT) 1664 atomic_dec(&ipvs->ftpsvc_counter); 1665 else if (svc->port == 0) 1666 atomic_dec(&ipvs->nullsvc_counter); 1667 1668 /* 1669 * Free the service if 
nobody refers to it 1670 */ 1671 __ip_vs_svc_put(svc); 1672 1673 /* decrease the module use count */ 1674 ip_vs_use_count_dec(); 1675 } 1676 1677 /* 1678 * Unlink a service from list and try to delete it if its refcnt reached 0 1679 */ 1680 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) 1681 { 1682 ip_vs_unregister_conntrack(svc); 1683 /* Hold svc to avoid double release from dest_trash */ 1684 atomic_inc(&svc->refcnt); 1685 /* 1686 * Unhash it from the service table 1687 */ 1688 ip_vs_svc_unhash(svc); 1689 1690 __ip_vs_del_service(svc, cleanup); 1691 } 1692 1693 /* 1694 * Delete a service from the service list 1695 */ 1696 static int ip_vs_del_service(struct ip_vs_service *svc) 1697 { 1698 if (svc == NULL) 1699 return -EEXIST; 1700 ip_vs_unlink_service(svc, false); 1701 1702 return 0; 1703 } 1704 1705 1706 /* 1707 * Flush all the virtual services 1708 */ 1709 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) 1710 { 1711 int idx; 1712 struct ip_vs_service *svc; 1713 struct hlist_node *n; 1714 1715 /* 1716 * Flush the service table hashed by <netns,protocol,addr,port> 1717 */ 1718 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1719 hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], 1720 s_list) { 1721 if (svc->ipvs == ipvs) 1722 ip_vs_unlink_service(svc, cleanup); 1723 } 1724 } 1725 1726 /* 1727 * Flush the service table hashed by fwmark 1728 */ 1729 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1730 hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], 1731 f_list) { 1732 if (svc->ipvs == ipvs) 1733 ip_vs_unlink_service(svc, cleanup); 1734 } 1735 } 1736 1737 return 0; 1738 } 1739 1740 /* 1741 * Delete service by {netns} in the service table. 
1742 * Called by __ip_vs_batch_cleanup() 1743 */ 1744 void ip_vs_service_nets_cleanup(struct list_head *net_list) 1745 { 1746 struct netns_ipvs *ipvs; 1747 struct net *net; 1748 1749 EnterFunction(2); 1750 /* Check for "full" addressed entries */ 1751 mutex_lock(&__ip_vs_mutex); 1752 list_for_each_entry(net, net_list, exit_list) { 1753 ipvs = net_ipvs(net); 1754 ip_vs_flush(ipvs, true); 1755 } 1756 mutex_unlock(&__ip_vs_mutex); 1757 LeaveFunction(2); 1758 } 1759 1760 /* Put all references for device (dst_cache) */ 1761 static inline void 1762 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) 1763 { 1764 struct ip_vs_dest_dst *dest_dst; 1765 1766 spin_lock_bh(&dest->dst_lock); 1767 dest_dst = rcu_dereference_protected(dest->dest_dst, 1); 1768 if (dest_dst && dest_dst->dst_cache->dev == dev) { 1769 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", 1770 dev->name, 1771 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1772 ntohs(dest->port), 1773 refcount_read(&dest->refcnt)); 1774 __ip_vs_dst_cache_reset(dest); 1775 } 1776 spin_unlock_bh(&dest->dst_lock); 1777 1778 } 1779 /* Netdev event receiver 1780 * Currently only NETDEV_DOWN is handled to release refs to cached dsts 1781 */ 1782 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, 1783 void *ptr) 1784 { 1785 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1786 struct net *net = dev_net(dev); 1787 struct netns_ipvs *ipvs = net_ipvs(net); 1788 struct ip_vs_service *svc; 1789 struct ip_vs_dest *dest; 1790 unsigned int idx; 1791 1792 if (event != NETDEV_DOWN || !ipvs) 1793 return NOTIFY_DONE; 1794 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); 1795 EnterFunction(2); 1796 mutex_lock(&__ip_vs_mutex); 1797 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1798 hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 1799 if (svc->ipvs == ipvs) { 1800 list_for_each_entry(dest, &svc->destinations, 1801 n_list) { 1802 ip_vs_forget_dev(dest, dev); 1803 } 1804 } 
1805 } 1806 1807 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 1808 if (svc->ipvs == ipvs) { 1809 list_for_each_entry(dest, &svc->destinations, 1810 n_list) { 1811 ip_vs_forget_dev(dest, dev); 1812 } 1813 } 1814 1815 } 1816 } 1817 1818 spin_lock_bh(&ipvs->dest_trash_lock); 1819 list_for_each_entry(dest, &ipvs->dest_trash, t_list) { 1820 ip_vs_forget_dev(dest, dev); 1821 } 1822 spin_unlock_bh(&ipvs->dest_trash_lock); 1823 mutex_unlock(&__ip_vs_mutex); 1824 LeaveFunction(2); 1825 return NOTIFY_DONE; 1826 } 1827 1828 /* 1829 * Zero counters in a service or all services 1830 */ 1831 static int ip_vs_zero_service(struct ip_vs_service *svc) 1832 { 1833 struct ip_vs_dest *dest; 1834 1835 list_for_each_entry(dest, &svc->destinations, n_list) { 1836 ip_vs_zero_stats(&dest->stats); 1837 } 1838 ip_vs_zero_stats(&svc->stats); 1839 return 0; 1840 } 1841 1842 static int ip_vs_zero_all(struct netns_ipvs *ipvs) 1843 { 1844 int idx; 1845 struct ip_vs_service *svc; 1846 1847 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1848 hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 1849 if (svc->ipvs == ipvs) 1850 ip_vs_zero_service(svc); 1851 } 1852 } 1853 1854 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1855 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 1856 if (svc->ipvs == ipvs) 1857 ip_vs_zero_service(svc); 1858 } 1859 } 1860 1861 ip_vs_zero_stats(&ipvs->tot_stats->s); 1862 return 0; 1863 } 1864 1865 #ifdef CONFIG_SYSCTL 1866 1867 static int 1868 proc_do_defense_mode(struct ctl_table *table, int write, 1869 void *buffer, size_t *lenp, loff_t *ppos) 1870 { 1871 struct netns_ipvs *ipvs = table->extra2; 1872 int *valp = table->data; 1873 int val = *valp; 1874 int rc; 1875 1876 struct ctl_table tmp = { 1877 .data = &val, 1878 .maxlen = sizeof(int), 1879 .mode = table->mode, 1880 }; 1881 1882 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 1883 if (write && (*valp != val)) { 1884 if (val < 0 || val > 3) { 1885 rc = -EINVAL; 1886 } else { 
1887 *valp = val; 1888 update_defense_level(ipvs); 1889 } 1890 } 1891 return rc; 1892 } 1893 1894 static int 1895 proc_do_sync_threshold(struct ctl_table *table, int write, 1896 void *buffer, size_t *lenp, loff_t *ppos) 1897 { 1898 int *valp = table->data; 1899 int val[2]; 1900 int rc; 1901 struct ctl_table tmp = { 1902 .data = &val, 1903 .maxlen = table->maxlen, 1904 .mode = table->mode, 1905 }; 1906 1907 memcpy(val, valp, sizeof(val)); 1908 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 1909 if (write) { 1910 if (val[0] < 0 || val[1] < 0 || 1911 (val[0] >= val[1] && val[1])) 1912 rc = -EINVAL; 1913 else 1914 memcpy(valp, val, sizeof(val)); 1915 } 1916 return rc; 1917 } 1918 1919 static int 1920 proc_do_sync_ports(struct ctl_table *table, int write, 1921 void *buffer, size_t *lenp, loff_t *ppos) 1922 { 1923 int *valp = table->data; 1924 int val = *valp; 1925 int rc; 1926 1927 struct ctl_table tmp = { 1928 .data = &val, 1929 .maxlen = sizeof(int), 1930 .mode = table->mode, 1931 }; 1932 1933 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 1934 if (write && (*valp != val)) { 1935 if (val < 1 || !is_power_of_2(val)) 1936 rc = -EINVAL; 1937 else 1938 *valp = val; 1939 } 1940 return rc; 1941 } 1942 1943 static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) 1944 { 1945 struct netns_ipvs *ipvs = table->extra2; 1946 cpumask_var_t *valp = table->data; 1947 cpumask_var_t newmask; 1948 int ret; 1949 1950 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) 1951 return -ENOMEM; 1952 1953 ret = cpulist_parse(buffer, newmask); 1954 if (ret) 1955 goto out; 1956 1957 mutex_lock(&ipvs->est_mutex); 1958 1959 if (!ipvs->est_cpulist_valid) { 1960 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { 1961 ret = -ENOMEM; 1962 goto unlock; 1963 } 1964 ipvs->est_cpulist_valid = 1; 1965 } 1966 cpumask_and(newmask, newmask, ¤t->cpus_mask); 1967 cpumask_copy(*valp, newmask); 1968 /* est_max_threads may depend on cpulist size */ 1969 ipvs->est_max_threads = 
ip_vs_est_max_threads(ipvs); 1970 ipvs->est_calc_phase = 1; 1971 ip_vs_est_reload_start(ipvs); 1972 1973 unlock: 1974 mutex_unlock(&ipvs->est_mutex); 1975 1976 out: 1977 free_cpumask_var(newmask); 1978 return ret; 1979 } 1980 1981 static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, 1982 size_t size) 1983 { 1984 struct netns_ipvs *ipvs = table->extra2; 1985 cpumask_var_t *valp = table->data; 1986 struct cpumask *mask; 1987 int ret; 1988 1989 mutex_lock(&ipvs->est_mutex); 1990 1991 if (ipvs->est_cpulist_valid) 1992 mask = *valp; 1993 else 1994 mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); 1995 ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); 1996 1997 mutex_unlock(&ipvs->est_mutex); 1998 1999 return ret; 2000 } 2001 2002 static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, 2003 void *buffer, size_t *lenp, loff_t *ppos) 2004 { 2005 int ret; 2006 2007 /* Ignore both read and write(append) if *ppos not 0 */ 2008 if (*ppos || !*lenp) { 2009 *lenp = 0; 2010 return 0; 2011 } 2012 if (write) { 2013 /* proc_sys_call_handler() appends terminator */ 2014 ret = ipvs_proc_est_cpumask_set(table, buffer); 2015 if (ret >= 0) 2016 *ppos += *lenp; 2017 } else { 2018 /* proc_sys_call_handler() allocates 1 byte for terminator */ 2019 ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); 2020 if (ret >= 0) { 2021 *lenp = ret; 2022 *ppos += *lenp; 2023 ret = 0; 2024 } 2025 } 2026 return ret; 2027 } 2028 2029 static int ipvs_proc_est_nice(struct ctl_table *table, int write, 2030 void *buffer, size_t *lenp, loff_t *ppos) 2031 { 2032 struct netns_ipvs *ipvs = table->extra2; 2033 int *valp = table->data; 2034 int val = *valp; 2035 int ret; 2036 2037 struct ctl_table tmp_table = { 2038 .data = &val, 2039 .maxlen = sizeof(int), 2040 .mode = table->mode, 2041 }; 2042 2043 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2044 if (write && ret >= 0) { 2045 if (val < MIN_NICE || val > MAX_NICE) { 2046 ret = 
-EINVAL; 2047 } else { 2048 mutex_lock(&ipvs->est_mutex); 2049 if (*valp != val) { 2050 *valp = val; 2051 ip_vs_est_reload_start(ipvs); 2052 } 2053 mutex_unlock(&ipvs->est_mutex); 2054 } 2055 } 2056 return ret; 2057 } 2058 2059 static int ipvs_proc_run_estimation(struct ctl_table *table, int write, 2060 void *buffer, size_t *lenp, loff_t *ppos) 2061 { 2062 struct netns_ipvs *ipvs = table->extra2; 2063 int *valp = table->data; 2064 int val = *valp; 2065 int ret; 2066 2067 struct ctl_table tmp_table = { 2068 .data = &val, 2069 .maxlen = sizeof(int), 2070 .mode = table->mode, 2071 }; 2072 2073 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2074 if (write && ret >= 0) { 2075 mutex_lock(&ipvs->est_mutex); 2076 if (*valp != val) { 2077 *valp = val; 2078 ip_vs_est_reload_start(ipvs); 2079 } 2080 mutex_unlock(&ipvs->est_mutex); 2081 } 2082 return ret; 2083 } 2084 2085 /* 2086 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) 2087 * Do not change order or insert new entries without 2088 * align with netns init in ip_vs_control_net_init() 2089 */ 2090 2091 static struct ctl_table vs_vars[] = { 2092 { 2093 .procname = "amemthresh", 2094 .maxlen = sizeof(int), 2095 .mode = 0644, 2096 .proc_handler = proc_dointvec, 2097 }, 2098 { 2099 .procname = "am_droprate", 2100 .maxlen = sizeof(int), 2101 .mode = 0644, 2102 .proc_handler = proc_dointvec, 2103 }, 2104 { 2105 .procname = "drop_entry", 2106 .maxlen = sizeof(int), 2107 .mode = 0644, 2108 .proc_handler = proc_do_defense_mode, 2109 }, 2110 { 2111 .procname = "drop_packet", 2112 .maxlen = sizeof(int), 2113 .mode = 0644, 2114 .proc_handler = proc_do_defense_mode, 2115 }, 2116 #ifdef CONFIG_IP_VS_NFCT 2117 { 2118 .procname = "conntrack", 2119 .maxlen = sizeof(int), 2120 .mode = 0644, 2121 .proc_handler = &proc_dointvec, 2122 }, 2123 #endif 2124 { 2125 .procname = "secure_tcp", 2126 .maxlen = sizeof(int), 2127 .mode = 0644, 2128 .proc_handler = proc_do_defense_mode, 2129 }, 2130 { 2131 .procname = "snat_reroute", 
2132 .maxlen = sizeof(int), 2133 .mode = 0644, 2134 .proc_handler = &proc_dointvec, 2135 }, 2136 { 2137 .procname = "sync_version", 2138 .maxlen = sizeof(int), 2139 .mode = 0644, 2140 .proc_handler = proc_dointvec_minmax, 2141 .extra1 = SYSCTL_ZERO, 2142 .extra2 = SYSCTL_ONE, 2143 }, 2144 { 2145 .procname = "sync_ports", 2146 .maxlen = sizeof(int), 2147 .mode = 0644, 2148 .proc_handler = proc_do_sync_ports, 2149 }, 2150 { 2151 .procname = "sync_persist_mode", 2152 .maxlen = sizeof(int), 2153 .mode = 0644, 2154 .proc_handler = proc_dointvec, 2155 }, 2156 { 2157 .procname = "sync_qlen_max", 2158 .maxlen = sizeof(unsigned long), 2159 .mode = 0644, 2160 .proc_handler = proc_doulongvec_minmax, 2161 }, 2162 { 2163 .procname = "sync_sock_size", 2164 .maxlen = sizeof(int), 2165 .mode = 0644, 2166 .proc_handler = proc_dointvec, 2167 }, 2168 { 2169 .procname = "cache_bypass", 2170 .maxlen = sizeof(int), 2171 .mode = 0644, 2172 .proc_handler = proc_dointvec, 2173 }, 2174 { 2175 .procname = "expire_nodest_conn", 2176 .maxlen = sizeof(int), 2177 .mode = 0644, 2178 .proc_handler = proc_dointvec, 2179 }, 2180 { 2181 .procname = "sloppy_tcp", 2182 .maxlen = sizeof(int), 2183 .mode = 0644, 2184 .proc_handler = proc_dointvec, 2185 }, 2186 { 2187 .procname = "sloppy_sctp", 2188 .maxlen = sizeof(int), 2189 .mode = 0644, 2190 .proc_handler = proc_dointvec, 2191 }, 2192 { 2193 .procname = "expire_quiescent_template", 2194 .maxlen = sizeof(int), 2195 .mode = 0644, 2196 .proc_handler = proc_dointvec, 2197 }, 2198 { 2199 .procname = "sync_threshold", 2200 .maxlen = 2201 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), 2202 .mode = 0644, 2203 .proc_handler = proc_do_sync_threshold, 2204 }, 2205 { 2206 .procname = "sync_refresh_period", 2207 .maxlen = sizeof(int), 2208 .mode = 0644, 2209 .proc_handler = proc_dointvec_jiffies, 2210 }, 2211 { 2212 .procname = "sync_retries", 2213 .maxlen = sizeof(int), 2214 .mode = 0644, 2215 .proc_handler = proc_dointvec_minmax, 2216 .extra1 = 
SYSCTL_ZERO, 2217 .extra2 = SYSCTL_THREE, 2218 }, 2219 { 2220 .procname = "nat_icmp_send", 2221 .maxlen = sizeof(int), 2222 .mode = 0644, 2223 .proc_handler = proc_dointvec, 2224 }, 2225 { 2226 .procname = "pmtu_disc", 2227 .maxlen = sizeof(int), 2228 .mode = 0644, 2229 .proc_handler = proc_dointvec, 2230 }, 2231 { 2232 .procname = "backup_only", 2233 .maxlen = sizeof(int), 2234 .mode = 0644, 2235 .proc_handler = proc_dointvec, 2236 }, 2237 { 2238 .procname = "conn_reuse_mode", 2239 .maxlen = sizeof(int), 2240 .mode = 0644, 2241 .proc_handler = proc_dointvec, 2242 }, 2243 { 2244 .procname = "schedule_icmp", 2245 .maxlen = sizeof(int), 2246 .mode = 0644, 2247 .proc_handler = proc_dointvec, 2248 }, 2249 { 2250 .procname = "ignore_tunneled", 2251 .maxlen = sizeof(int), 2252 .mode = 0644, 2253 .proc_handler = proc_dointvec, 2254 }, 2255 { 2256 .procname = "run_estimation", 2257 .maxlen = sizeof(int), 2258 .mode = 0644, 2259 .proc_handler = ipvs_proc_run_estimation, 2260 }, 2261 { 2262 .procname = "est_cpulist", 2263 .maxlen = NR_CPUS, /* unused */ 2264 .mode = 0644, 2265 .proc_handler = ipvs_proc_est_cpulist, 2266 }, 2267 { 2268 .procname = "est_nice", 2269 .maxlen = sizeof(int), 2270 .mode = 0644, 2271 .proc_handler = ipvs_proc_est_nice, 2272 }, 2273 #ifdef CONFIG_IP_VS_DEBUG 2274 { 2275 .procname = "debug_level", 2276 .data = &sysctl_ip_vs_debug_level, 2277 .maxlen = sizeof(int), 2278 .mode = 0644, 2279 .proc_handler = proc_dointvec, 2280 }, 2281 #endif 2282 { } 2283 }; 2284 2285 #endif 2286 2287 #ifdef CONFIG_PROC_FS 2288 2289 struct ip_vs_iter { 2290 struct seq_net_private p; /* Do not move this, netns depends upon it*/ 2291 struct hlist_head *table; 2292 int bucket; 2293 }; 2294 2295 /* 2296 * Write the contents of the VS rule table to a PROCfs file. 
2297 * (It is kept just for backward compatibility) 2298 */ 2299 static inline const char *ip_vs_fwd_name(unsigned int flags) 2300 { 2301 switch (flags & IP_VS_CONN_F_FWD_MASK) { 2302 case IP_VS_CONN_F_LOCALNODE: 2303 return "Local"; 2304 case IP_VS_CONN_F_TUNNEL: 2305 return "Tunnel"; 2306 case IP_VS_CONN_F_DROUTE: 2307 return "Route"; 2308 default: 2309 return "Masq"; 2310 } 2311 } 2312 2313 2314 /* Get the Nth entry in the two lists */ 2315 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 2316 { 2317 struct net *net = seq_file_net(seq); 2318 struct netns_ipvs *ipvs = net_ipvs(net); 2319 struct ip_vs_iter *iter = seq->private; 2320 int idx; 2321 struct ip_vs_service *svc; 2322 2323 /* look in hash by protocol */ 2324 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2325 hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { 2326 if ((svc->ipvs == ipvs) && pos-- == 0) { 2327 iter->table = ip_vs_svc_table; 2328 iter->bucket = idx; 2329 return svc; 2330 } 2331 } 2332 } 2333 2334 /* keep looking in fwmark */ 2335 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2336 hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], 2337 f_list) { 2338 if ((svc->ipvs == ipvs) && pos-- == 0) { 2339 iter->table = ip_vs_svc_fwm_table; 2340 iter->bucket = idx; 2341 return svc; 2342 } 2343 } 2344 } 2345 2346 return NULL; 2347 } 2348 2349 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 2350 __acquires(RCU) 2351 { 2352 rcu_read_lock(); 2353 return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; 2354 } 2355 2356 2357 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2358 { 2359 struct hlist_node *e; 2360 struct ip_vs_iter *iter; 2361 struct ip_vs_service *svc; 2362 2363 ++*pos; 2364 if (v == SEQ_START_TOKEN) 2365 return ip_vs_info_array(seq,0); 2366 2367 svc = v; 2368 iter = seq->private; 2369 2370 if (iter->table == ip_vs_svc_table) { 2371 /* next service in table hashed by protocol */ 2372 e = rcu_dereference(hlist_next_rcu(&svc->s_list)); 2373 if (e) 2374 return hlist_entry(e, struct ip_vs_service, s_list); 2375 2376 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { 2377 hlist_for_each_entry_rcu(svc, 2378 &ip_vs_svc_table[iter->bucket], 2379 s_list) { 2380 return svc; 2381 } 2382 } 2383 2384 iter->table = ip_vs_svc_fwm_table; 2385 iter->bucket = -1; 2386 goto scan_fwmark; 2387 } 2388 2389 /* next service in hashed by fwmark */ 2390 e = rcu_dereference(hlist_next_rcu(&svc->f_list)); 2391 if (e) 2392 return hlist_entry(e, struct ip_vs_service, f_list); 2393 2394 scan_fwmark: 2395 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { 2396 hlist_for_each_entry_rcu(svc, 2397 &ip_vs_svc_fwm_table[iter->bucket], 2398 f_list) 2399 return svc; 2400 } 2401 2402 return NULL; 2403 } 2404 2405 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) 2406 __releases(RCU) 2407 { 2408 rcu_read_unlock(); 2409 } 2410 2411 2412 static int ip_vs_info_seq_show(struct seq_file *seq, void *v) 2413 { 2414 if (v == SEQ_START_TOKEN) { 2415 seq_printf(seq, 2416 "IP Virtual Server version %d.%d.%d (size=%d)\n", 2417 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); 2418 seq_puts(seq, 2419 "Prot LocalAddress:Port Scheduler Flags\n"); 2420 seq_puts(seq, 2421 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); 2422 } else { 2423 struct net *net = seq_file_net(seq); 2424 struct netns_ipvs *ipvs = net_ipvs(net); 2425 const struct ip_vs_service *svc = v; 2426 const struct ip_vs_iter *iter = 
seq->private; 2427 const struct ip_vs_dest *dest; 2428 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); 2429 char *sched_name = sched ? sched->name : "none"; 2430 2431 if (svc->ipvs != ipvs) 2432 return 0; 2433 if (iter->table == ip_vs_svc_table) { 2434 #ifdef CONFIG_IP_VS_IPV6 2435 if (svc->af == AF_INET6) 2436 seq_printf(seq, "%s [%pI6]:%04X %s ", 2437 ip_vs_proto_name(svc->protocol), 2438 &svc->addr.in6, 2439 ntohs(svc->port), 2440 sched_name); 2441 else 2442 #endif 2443 seq_printf(seq, "%s %08X:%04X %s %s ", 2444 ip_vs_proto_name(svc->protocol), 2445 ntohl(svc->addr.ip), 2446 ntohs(svc->port), 2447 sched_name, 2448 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2449 } else { 2450 seq_printf(seq, "FWM %08X %s %s", 2451 svc->fwmark, sched_name, 2452 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2453 } 2454 2455 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 2456 seq_printf(seq, "persistent %d %08X\n", 2457 svc->timeout, 2458 ntohl(svc->netmask)); 2459 else 2460 seq_putc(seq, '\n'); 2461 2462 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 2463 #ifdef CONFIG_IP_VS_IPV6 2464 if (dest->af == AF_INET6) 2465 seq_printf(seq, 2466 " -> [%pI6]:%04X" 2467 " %-7s %-6d %-10d %-10d\n", 2468 &dest->addr.in6, 2469 ntohs(dest->port), 2470 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2471 atomic_read(&dest->weight), 2472 atomic_read(&dest->activeconns), 2473 atomic_read(&dest->inactconns)); 2474 else 2475 #endif 2476 seq_printf(seq, 2477 " -> %08X:%04X " 2478 "%-7s %-6d %-10d %-10d\n", 2479 ntohl(dest->addr.ip), 2480 ntohs(dest->port), 2481 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2482 atomic_read(&dest->weight), 2483 atomic_read(&dest->activeconns), 2484 atomic_read(&dest->inactconns)); 2485 2486 } 2487 } 2488 return 0; 2489 } 2490 2491 static const struct seq_operations ip_vs_info_seq_ops = { 2492 .start = ip_vs_info_seq_start, 2493 .next = ip_vs_info_seq_next, 2494 .stop = ip_vs_info_seq_stop, 2495 .show = ip_vs_info_seq_show, 2496 }; 2497 
2498 static int ip_vs_stats_show(struct seq_file *seq, void *v) 2499 { 2500 struct net *net = seq_file_single_net(seq); 2501 struct ip_vs_kstats show; 2502 2503 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2504 seq_puts(seq, 2505 " Total Incoming Outgoing Incoming Outgoing\n"); 2506 seq_puts(seq, 2507 " Conns Packets Packets Bytes Bytes\n"); 2508 2509 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); 2510 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", 2511 (unsigned long long)show.conns, 2512 (unsigned long long)show.inpkts, 2513 (unsigned long long)show.outpkts, 2514 (unsigned long long)show.inbytes, 2515 (unsigned long long)show.outbytes); 2516 2517 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ 2518 seq_puts(seq, 2519 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 2520 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", 2521 (unsigned long long)show.cps, 2522 (unsigned long long)show.inpps, 2523 (unsigned long long)show.outpps, 2524 (unsigned long long)show.inbps, 2525 (unsigned long long)show.outbps); 2526 2527 return 0; 2528 } 2529 2530 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) 2531 { 2532 struct net *net = seq_file_single_net(seq); 2533 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; 2534 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; 2535 struct ip_vs_kstats kstats; 2536 int i; 2537 2538 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2539 seq_puts(seq, 2540 " Total Incoming Outgoing Incoming Outgoing\n"); 2541 seq_puts(seq, 2542 "CPU Conns Packets Packets Bytes Bytes\n"); 2543 2544 for_each_possible_cpu(i) { 2545 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); 2546 unsigned int start; 2547 u64 conns, inpkts, outpkts, inbytes, outbytes; 2548 2549 do { 2550 start = u64_stats_fetch_begin(&u->syncp); 2551 conns = u64_stats_read(&u->cnt.conns); 2552 inpkts = u64_stats_read(&u->cnt.inpkts); 2553 outpkts = u64_stats_read(&u->cnt.outpkts); 
2554 inbytes = u64_stats_read(&u->cnt.inbytes); 2555 outbytes = u64_stats_read(&u->cnt.outbytes); 2556 } while (u64_stats_fetch_retry(&u->syncp, start)); 2557 2558 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", 2559 i, (u64)conns, (u64)inpkts, 2560 (u64)outpkts, (u64)inbytes, 2561 (u64)outbytes); 2562 } 2563 2564 ip_vs_copy_stats(&kstats, tot_stats); 2565 2566 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", 2567 (unsigned long long)kstats.conns, 2568 (unsigned long long)kstats.inpkts, 2569 (unsigned long long)kstats.outpkts, 2570 (unsigned long long)kstats.inbytes, 2571 (unsigned long long)kstats.outbytes); 2572 2573 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2574 seq_puts(seq, 2575 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 2576 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", 2577 kstats.cps, 2578 kstats.inpps, 2579 kstats.outpps, 2580 kstats.inbps, 2581 kstats.outbps); 2582 2583 return 0; 2584 } 2585 #endif 2586 2587 /* 2588 * Set timeout values for tcp tcpfin udp in the timeout_table. 
2589 */ 2590 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 2591 { 2592 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 2593 struct ip_vs_proto_data *pd; 2594 #endif 2595 2596 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", 2597 u->tcp_timeout, 2598 u->tcp_fin_timeout, 2599 u->udp_timeout); 2600 2601 #ifdef CONFIG_IP_VS_PROTO_TCP 2602 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || 2603 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { 2604 return -EINVAL; 2605 } 2606 #endif 2607 2608 #ifdef CONFIG_IP_VS_PROTO_UDP 2609 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) 2610 return -EINVAL; 2611 #endif 2612 2613 #ifdef CONFIG_IP_VS_PROTO_TCP 2614 if (u->tcp_timeout) { 2615 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 2616 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] 2617 = u->tcp_timeout * HZ; 2618 } 2619 2620 if (u->tcp_fin_timeout) { 2621 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 2622 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] 2623 = u->tcp_fin_timeout * HZ; 2624 } 2625 #endif 2626 2627 #ifdef CONFIG_IP_VS_PROTO_UDP 2628 if (u->udp_timeout) { 2629 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 2630 pd->timeout_table[IP_VS_UDP_S_NORMAL] 2631 = u->udp_timeout * HZ; 2632 } 2633 #endif 2634 return 0; 2635 } 2636 2637 #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) 2638 2639 struct ip_vs_svcdest_user { 2640 struct ip_vs_service_user s; 2641 struct ip_vs_dest_user d; 2642 }; 2643 2644 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { 2645 [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), 2646 [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), 2647 [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), 2648 [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), 2649 [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), 2650 [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), 2651 
[CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 2652 [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), 2653 [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), 2654 [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), 2655 }; 2656 2657 union ip_vs_set_arglen { 2658 struct ip_vs_service_user field_IP_VS_SO_SET_ADD; 2659 struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; 2660 struct ip_vs_service_user field_IP_VS_SO_SET_DEL; 2661 struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; 2662 struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; 2663 struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; 2664 struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; 2665 struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; 2666 struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; 2667 struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; 2668 }; 2669 2670 #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) 2671 2672 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, 2673 struct ip_vs_service_user *usvc_compat) 2674 { 2675 memset(usvc, 0, sizeof(*usvc)); 2676 2677 usvc->af = AF_INET; 2678 usvc->protocol = usvc_compat->protocol; 2679 usvc->addr.ip = usvc_compat->addr; 2680 usvc->port = usvc_compat->port; 2681 usvc->fwmark = usvc_compat->fwmark; 2682 2683 /* Deep copy of sched_name is not needed here */ 2684 usvc->sched_name = usvc_compat->sched_name; 2685 2686 usvc->flags = usvc_compat->flags; 2687 usvc->timeout = usvc_compat->timeout; 2688 usvc->netmask = usvc_compat->netmask; 2689 } 2690 2691 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, 2692 struct ip_vs_dest_user *udest_compat) 2693 { 2694 memset(udest, 0, sizeof(*udest)); 2695 2696 udest->addr.ip = udest_compat->addr; 2697 udest->port = udest_compat->port; 2698 udest->conn_flags = udest_compat->conn_flags; 2699 udest->weight = udest_compat->weight; 2700 udest->u_threshold = udest_compat->u_threshold; 
/*
 * Handler for the IPVS "set" socket options.
 *
 * @sk:  socket the option was set on; selects the network namespace
 * @cmd: command number, IP_VS_BASE_CTL .. IP_VS_SO_SET_MAX
 * @ptr: argument supplied by the caller
 * @len: argument length; must equal set_arglen[CMDID(cmd)] exactly
 *
 * Returns 0 or a negative errno:
 *   -EPERM   caller lacks CAP_NET_ADMIN in the namespace's user namespace
 *   -EINVAL  command out of range, wrong argument length, bad interface
 *            name, or scheduler name not terminated within
 *            IP_VS_SCHEDNAME_MAXLEN bytes
 *   -EFAULT  argument could not be copied in, or the service protocol is
 *            not TCP, UDP or SCTP
 *   -ESRCH   service not found (every command except ADD)
 *   -EEXIST  ADD of a service that already exists
 * otherwise whatever the invoked helper returns.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_SET_ARGLEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* set_arglen[] stores lengths in an unsigned char */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	if (len != set_arglen[CMDID(cmd)]) {
		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
			  len, set_arglen[CMDID(cmd)]);
		return -EINVAL;
	}

	/* len was matched against the table above, so it fits into arg[] */
	if (copy_from_sockptr(arg, ptr, len) != 0)
		return -EFAULT;

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
			struct ipvs_sync_daemon_cfg cfg;

			memset(&cfg, 0, sizeof(cfg));
			ret = -EINVAL;
			/* rejects an empty name as well as one that does not
			 * fit into cfg.mcast_ifn
			 */
			if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
				    sizeof(cfg.mcast_ifn)) <= 0)
				return ret;
			cfg.syncid = dm->syncid;
			ret = start_sync_thread(ipvs, &cfg, dm->state);
		} else {
			ret = stop_sync_thread(ipvs, dm->state);
		}
		return ret;
	}

	/* everything below runs under the configuration mutex */
	mutex_lock(&__ip_vs_mutex);
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(ipvs, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (!len) {
		/* No more commands with len == 0 below */
		ret = -EINVAL;
		goto out_unlock;
	}

	/* the argument starts with a service; for the *DEST commands a
	 * destination follows directly behind it
	 */
	usvc_compat = (struct ip_vs_service_user *)arg;
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	/* NOTE(review): for commands whose argument is only a service,
	 * udest_compat points at bytes of arg[] that were never written, so
	 * udest is built from uninitialised stack data.  udest is only
	 * passed on by the *DEST cases below.
	 */
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(ipvs);
			goto out_unlock;
		}
	}

	/* the scheduler name must be NUL-terminated within its field */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port));
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* every command except ADD needs an existing service whose
	 * protocol matches the request
	 */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		/* in range, but not a command handled by this function */
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		break;
	}

  out_unlock:
	mutex_unlock(&__ip_vs_mutex);
	return ret;
}
IP_VS_SO_SET_DEL: 2829 ret = ip_vs_del_service(svc); 2830 if (!ret) 2831 goto out_unlock; 2832 break; 2833 case IP_VS_SO_SET_ZERO: 2834 ret = ip_vs_zero_service(svc); 2835 break; 2836 case IP_VS_SO_SET_ADDDEST: 2837 ret = ip_vs_add_dest(svc, &udest); 2838 break; 2839 case IP_VS_SO_SET_EDITDEST: 2840 ret = ip_vs_edit_dest(svc, &udest); 2841 break; 2842 case IP_VS_SO_SET_DELDEST: 2843 ret = ip_vs_del_dest(svc, &udest); 2844 break; 2845 default: 2846 WARN_ON_ONCE(1); 2847 ret = -EINVAL; 2848 break; 2849 } 2850 2851 out_unlock: 2852 mutex_unlock(&__ip_vs_mutex); 2853 return ret; 2854 } 2855 2856 2857 static void 2858 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) 2859 { 2860 struct ip_vs_scheduler *sched; 2861 struct ip_vs_kstats kstats; 2862 char *sched_name; 2863 2864 sched = rcu_dereference_protected(src->scheduler, 1); 2865 sched_name = sched ? sched->name : "none"; 2866 dst->protocol = src->protocol; 2867 dst->addr = src->addr.ip; 2868 dst->port = src->port; 2869 dst->fwmark = src->fwmark; 2870 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); 2871 dst->flags = src->flags; 2872 dst->timeout = src->timeout / HZ; 2873 dst->netmask = src->netmask; 2874 dst->num_dests = src->num_dests; 2875 ip_vs_copy_stats(&kstats, &src->stats); 2876 ip_vs_export_stats_user(&dst->stats, &kstats); 2877 } 2878 2879 static inline int 2880 __ip_vs_get_service_entries(struct netns_ipvs *ipvs, 2881 const struct ip_vs_get_services *get, 2882 struct ip_vs_get_services __user *uptr) 2883 { 2884 int idx, count=0; 2885 struct ip_vs_service *svc; 2886 struct ip_vs_service_entry entry; 2887 int ret = 0; 2888 2889 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2890 hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2891 /* Only expose IPv4 entries to old interface */ 2892 if (svc->af != AF_INET || (svc->ipvs != ipvs)) 2893 continue; 2894 2895 if (count >= get->num_services) 2896 goto out; 2897 memset(&entry, 0, sizeof(entry)); 2898 
ip_vs_copy_service(&entry, svc); 2899 if (copy_to_user(&uptr->entrytable[count], 2900 &entry, sizeof(entry))) { 2901 ret = -EFAULT; 2902 goto out; 2903 } 2904 count++; 2905 } 2906 } 2907 2908 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2909 hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2910 /* Only expose IPv4 entries to old interface */ 2911 if (svc->af != AF_INET || (svc->ipvs != ipvs)) 2912 continue; 2913 2914 if (count >= get->num_services) 2915 goto out; 2916 memset(&entry, 0, sizeof(entry)); 2917 ip_vs_copy_service(&entry, svc); 2918 if (copy_to_user(&uptr->entrytable[count], 2919 &entry, sizeof(entry))) { 2920 ret = -EFAULT; 2921 goto out; 2922 } 2923 count++; 2924 } 2925 } 2926 out: 2927 return ret; 2928 } 2929 2930 static inline int 2931 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, 2932 struct ip_vs_get_dests __user *uptr) 2933 { 2934 struct ip_vs_service *svc; 2935 union nf_inet_addr addr = { .ip = get->addr }; 2936 int ret = 0; 2937 2938 rcu_read_lock(); 2939 if (get->fwmark) 2940 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); 2941 else 2942 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, 2943 get->port); 2944 rcu_read_unlock(); 2945 2946 if (svc) { 2947 int count = 0; 2948 struct ip_vs_dest *dest; 2949 struct ip_vs_dest_entry entry; 2950 struct ip_vs_kstats kstats; 2951 2952 memset(&entry, 0, sizeof(entry)); 2953 list_for_each_entry(dest, &svc->destinations, n_list) { 2954 if (count >= get->num_dests) 2955 break; 2956 2957 /* Cannot expose heterogeneous members via sockopt 2958 * interface 2959 */ 2960 if (dest->af != svc->af) 2961 continue; 2962 2963 entry.addr = dest->addr.ip; 2964 entry.port = dest->port; 2965 entry.conn_flags = atomic_read(&dest->conn_flags); 2966 entry.weight = atomic_read(&dest->weight); 2967 entry.u_threshold = dest->u_threshold; 2968 entry.l_threshold = dest->l_threshold; 2969 entry.activeconns = atomic_read(&dest->activeconns); 2970 
/*
 * Report the current tcp, tcpfin and udp timeouts in @u, in seconds.
 *
 * @u is zeroed first, so the fields of a protocol that is not compiled in
 * read as 0.  Stored values are divided by HZ, the inverse of the
 * conversion done by ip_vs_set_timeout().
 */
static inline void
__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
	struct ip_vs_proto_data *pd;
#endif

	memset(u, 0, sizeof (*u));

#ifdef CONFIG_IP_VS_PROTO_TCP
	pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
	pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
	u->udp_timeout =
			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}

/*
 * Minimum argument length of each get command, indexed by CMDID().  This
 * many bytes are copied in from user space before the command is handled.
 * Commands without an entry here have length 0.
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
	[CMDID(IP_VS_SO_GET_VERSION)]  = 64,
	[CMDID(IP_VS_SO_GET_INFO)]     = sizeof(struct ip_vs_getinfo),
	[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
	[CMDID(IP_VS_SO_GET_SERVICE)]  = sizeof(struct ip_vs_service_entry),
	[CMDID(IP_VS_SO_GET_DESTS)]    = sizeof(struct ip_vs_get_dests),
	[CMDID(IP_VS_SO_GET_TIMEOUT)]  = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_GET_DAEMON)]   = 2 * sizeof(struct ip_vs_daemon_user),
};

/*
 * One member per get command, sized like its get_arglen[] entry.  Here the
 * union is only used through sizeof(), which yields the largest of them.
 */
union ip_vs_get_arglen {
	char				field_IP_VS_SO_GET_VERSION[64];
	struct ip_vs_getinfo		field_IP_VS_SO_GET_INFO;
	struct ip_vs_get_services	field_IP_VS_SO_GET_SERVICES;
	struct ip_vs_service_entry	field_IP_VS_SO_GET_SERVICE;
	struct ip_vs_get_dests		field_IP_VS_SO_GET_DESTS;
	struct ip_vs_timeout_user	field_IP_VS_SO_GET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_GET_DAEMON[2];
};
field_IP_VS_SO_GET_TIMEOUT; 3024 struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; 3025 }; 3026 3027 #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) 3028 3029 static int 3030 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) 3031 { 3032 unsigned char arg[MAX_GET_ARGLEN]; 3033 int ret = 0; 3034 unsigned int copylen; 3035 struct net *net = sock_net(sk); 3036 struct netns_ipvs *ipvs = net_ipvs(net); 3037 3038 BUG_ON(!net); 3039 BUILD_BUG_ON(sizeof(arg) > 255); 3040 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3041 return -EPERM; 3042 3043 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) 3044 return -EINVAL; 3045 3046 copylen = get_arglen[CMDID(cmd)]; 3047 if (*len < (int) copylen) { 3048 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); 3049 return -EINVAL; 3050 } 3051 3052 if (copy_from_user(arg, user, copylen) != 0) 3053 return -EFAULT; 3054 /* 3055 * Handle daemons first since it has its own locking 3056 */ 3057 if (cmd == IP_VS_SO_GET_DAEMON) { 3058 struct ip_vs_daemon_user d[2]; 3059 3060 memset(&d, 0, sizeof(d)); 3061 mutex_lock(&ipvs->sync_mutex); 3062 if (ipvs->sync_state & IP_VS_STATE_MASTER) { 3063 d[0].state = IP_VS_STATE_MASTER; 3064 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, 3065 sizeof(d[0].mcast_ifn)); 3066 d[0].syncid = ipvs->mcfg.syncid; 3067 } 3068 if (ipvs->sync_state & IP_VS_STATE_BACKUP) { 3069 d[1].state = IP_VS_STATE_BACKUP; 3070 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, 3071 sizeof(d[1].mcast_ifn)); 3072 d[1].syncid = ipvs->bcfg.syncid; 3073 } 3074 if (copy_to_user(user, &d, sizeof(d)) != 0) 3075 ret = -EFAULT; 3076 mutex_unlock(&ipvs->sync_mutex); 3077 return ret; 3078 } 3079 3080 mutex_lock(&__ip_vs_mutex); 3081 switch (cmd) { 3082 case IP_VS_SO_GET_VERSION: 3083 { 3084 char buf[64]; 3085 3086 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", 3087 NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); 3088 if (copy_to_user(user, buf, strlen(buf)+1) != 0) { 3089 ret = -EFAULT; 
3090 goto out; 3091 } 3092 *len = strlen(buf)+1; 3093 } 3094 break; 3095 3096 case IP_VS_SO_GET_INFO: 3097 { 3098 struct ip_vs_getinfo info; 3099 info.version = IP_VS_VERSION_CODE; 3100 info.size = ip_vs_conn_tab_size; 3101 info.num_services = ipvs->num_services; 3102 if (copy_to_user(user, &info, sizeof(info)) != 0) 3103 ret = -EFAULT; 3104 } 3105 break; 3106 3107 case IP_VS_SO_GET_SERVICES: 3108 { 3109 struct ip_vs_get_services *get; 3110 int size; 3111 3112 get = (struct ip_vs_get_services *)arg; 3113 size = struct_size(get, entrytable, get->num_services); 3114 if (*len != size) { 3115 pr_err("length: %u != %u\n", *len, size); 3116 ret = -EINVAL; 3117 goto out; 3118 } 3119 ret = __ip_vs_get_service_entries(ipvs, get, user); 3120 } 3121 break; 3122 3123 case IP_VS_SO_GET_SERVICE: 3124 { 3125 struct ip_vs_service_entry *entry; 3126 struct ip_vs_service *svc; 3127 union nf_inet_addr addr; 3128 3129 entry = (struct ip_vs_service_entry *)arg; 3130 addr.ip = entry->addr; 3131 rcu_read_lock(); 3132 if (entry->fwmark) 3133 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); 3134 else 3135 svc = __ip_vs_service_find(ipvs, AF_INET, 3136 entry->protocol, &addr, 3137 entry->port); 3138 rcu_read_unlock(); 3139 if (svc) { 3140 ip_vs_copy_service(entry, svc); 3141 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 3142 ret = -EFAULT; 3143 } else 3144 ret = -ESRCH; 3145 } 3146 break; 3147 3148 case IP_VS_SO_GET_DESTS: 3149 { 3150 struct ip_vs_get_dests *get; 3151 int size; 3152 3153 get = (struct ip_vs_get_dests *)arg; 3154 size = struct_size(get, entrytable, get->num_dests); 3155 if (*len != size) { 3156 pr_err("length: %u != %u\n", *len, size); 3157 ret = -EINVAL; 3158 goto out; 3159 } 3160 ret = __ip_vs_get_dest_entries(ipvs, get, user); 3161 } 3162 break; 3163 3164 case IP_VS_SO_GET_TIMEOUT: 3165 { 3166 struct ip_vs_timeout_user t; 3167 3168 __ip_vs_get_timeouts(ipvs, &t); 3169 if (copy_to_user(user, &t, sizeof(t)) != 0) 3170 ret = -EFAULT; 3171 } 3172 break; 3173 
/*
 * Socket option registration: PF_INET options IP_VS_BASE_CTL up to and
 * including IP_VS_SO_SET_MAX / IP_VS_SO_GET_MAX (the *_optmax values are
 * one past the last command) are routed to the two handlers above.
 */
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
	.owner		= THIS_MODULE,
};

/*
 * Generic Netlink interface
 */

/* IPVS genetlink family */
static struct genl_family ip_vs_genl_family;

/* Policy used for first-level command attributes */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
};

/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_IFNAME_MAXLEN - 1 },
	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_SYNC_MAXLEN]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP]	= { .type = NLA_U32 },
	/* no .type given here, only a length */
	[IPVS_DAEMON_ATTR_MCAST_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IPVS_DAEMON_ATTR_MCAST_PORT]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_TTL]	= { .type = NLA_U8 },
};

/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
	/* NOTE(review): unlike the other NUL-terminated strings in these
	 * policies, this limit is not "MAXLEN - 1" - confirm against the
	 * size of the buffer the name ends up in
	 */
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};
[IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, 3229 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, 3230 .len = sizeof(union nf_inet_addr) }, 3231 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, 3232 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, 3233 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, 3234 .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, 3235 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, 3236 .len = IP_VS_PENAME_MAXLEN }, 3237 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, 3238 .len = sizeof(struct ip_vs_flags) }, 3239 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, 3240 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, 3241 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, 3242 }; 3243 3244 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ 3245 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { 3246 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, 3247 .len = sizeof(union nf_inet_addr) }, 3248 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, 3249 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, 3250 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, 3251 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, 3252 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, 3253 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, 3254 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, 3255 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, 3256 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, 3257 [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, 3258 [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, 3259 [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, 3260 [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, 3261 }; 3262 3263 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, 3264 struct ip_vs_kstats *kstats) 3265 { 3266 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3267 3268 if (!nl_stats) 3269 return -EMSGSIZE; 3270 3271 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || 3272 
nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || 3273 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || 3274 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3275 IPVS_STATS_ATTR_PAD) || 3276 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3277 IPVS_STATS_ATTR_PAD) || 3278 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || 3279 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || 3280 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || 3281 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || 3282 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) 3283 goto nla_put_failure; 3284 nla_nest_end(skb, nl_stats); 3285 3286 return 0; 3287 3288 nla_put_failure: 3289 nla_nest_cancel(skb, nl_stats); 3290 return -EMSGSIZE; 3291 } 3292 3293 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, 3294 struct ip_vs_kstats *kstats) 3295 { 3296 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3297 3298 if (!nl_stats) 3299 return -EMSGSIZE; 3300 3301 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, 3302 IPVS_STATS_ATTR_PAD) || 3303 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, 3304 IPVS_STATS_ATTR_PAD) || 3305 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, 3306 IPVS_STATS_ATTR_PAD) || 3307 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3308 IPVS_STATS_ATTR_PAD) || 3309 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3310 IPVS_STATS_ATTR_PAD) || 3311 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, 3312 IPVS_STATS_ATTR_PAD) || 3313 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, 3314 IPVS_STATS_ATTR_PAD) || 3315 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, 3316 IPVS_STATS_ATTR_PAD) || 3317 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, 3318 IPVS_STATS_ATTR_PAD) || 3319 
nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, 3320 IPVS_STATS_ATTR_PAD)) 3321 goto nla_put_failure; 3322 nla_nest_end(skb, nl_stats); 3323 3324 return 0; 3325 3326 nla_put_failure: 3327 nla_nest_cancel(skb, nl_stats); 3328 return -EMSGSIZE; 3329 } 3330 3331 static int ip_vs_genl_fill_service(struct sk_buff *skb, 3332 struct ip_vs_service *svc) 3333 { 3334 struct ip_vs_scheduler *sched; 3335 struct ip_vs_pe *pe; 3336 struct nlattr *nl_service; 3337 struct ip_vs_flags flags = { .flags = svc->flags, 3338 .mask = ~0 }; 3339 struct ip_vs_kstats kstats; 3340 char *sched_name; 3341 3342 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); 3343 if (!nl_service) 3344 return -EMSGSIZE; 3345 3346 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) 3347 goto nla_put_failure; 3348 if (svc->fwmark) { 3349 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) 3350 goto nla_put_failure; 3351 } else { 3352 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || 3353 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || 3354 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) 3355 goto nla_put_failure; 3356 } 3357 3358 sched = rcu_dereference_protected(svc->scheduler, 1); 3359 sched_name = sched ? 
sched->name : "none"; 3360 pe = rcu_dereference_protected(svc->pe, 1); 3361 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || 3362 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || 3363 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || 3364 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || 3365 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) 3366 goto nla_put_failure; 3367 ip_vs_copy_stats(&kstats, &svc->stats); 3368 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) 3369 goto nla_put_failure; 3370 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) 3371 goto nla_put_failure; 3372 3373 nla_nest_end(skb, nl_service); 3374 3375 return 0; 3376 3377 nla_put_failure: 3378 nla_nest_cancel(skb, nl_service); 3379 return -EMSGSIZE; 3380 } 3381 3382 static int ip_vs_genl_dump_service(struct sk_buff *skb, 3383 struct ip_vs_service *svc, 3384 struct netlink_callback *cb) 3385 { 3386 void *hdr; 3387 3388 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 3389 &ip_vs_genl_family, NLM_F_MULTI, 3390 IPVS_CMD_NEW_SERVICE); 3391 if (!hdr) 3392 return -EMSGSIZE; 3393 3394 if (ip_vs_genl_fill_service(skb, svc) < 0) 3395 goto nla_put_failure; 3396 3397 genlmsg_end(skb, hdr); 3398 return 0; 3399 3400 nla_put_failure: 3401 genlmsg_cancel(skb, hdr); 3402 return -EMSGSIZE; 3403 } 3404 3405 static int ip_vs_genl_dump_services(struct sk_buff *skb, 3406 struct netlink_callback *cb) 3407 { 3408 int idx = 0, i; 3409 int start = cb->args[0]; 3410 struct ip_vs_service *svc; 3411 struct net *net = sock_net(skb->sk); 3412 struct netns_ipvs *ipvs = net_ipvs(net); 3413 3414 mutex_lock(&__ip_vs_mutex); 3415 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { 3416 hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { 3417 if (++idx <= start || (svc->ipvs != ipvs)) 3418 continue; 3419 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { 3420 idx--; 3421 goto nla_put_failure; 3422 } 3423 } 3424 } 3425 3426 for 
/*
 * Tell whether @af is an address family services may use: AF_INET always,
 * AF_INET6 only when IPVS is built with IPv6 support and
 * ipv6_mod_enabled() returns true.
 */
static bool ip_vs_is_af_valid(int af)
{
	if (af == AF_INET)
		return true;
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6 && ipv6_mod_enabled())
		return true;
#endif
	return false;
}

/*
 * Parse a nested IPVS_CMD_ATTR_SERVICE attribute into @usvc and look up
 * the service it identifies.
 *
 * @ipvs:       netns to look the service up in
 * @usvc:       receives the parsed description; zeroed first
 * @nla:        the nested service attribute, may be NULL
 * @full_entry: also require and parse scheduler, flags, timeout and
 *              netmask (the persistence engine name stays optional)
 * @ret_svc:    receives the matching existing service or NULL; only
 *              written once the identifying attributes were accepted
 *
 * The address family is mandatory, together with either a fwmark or all
 * of protocol, address and port.
 *
 * Returns 0, -EINVAL if @nla is NULL, cannot be parsed or lacks a
 * required attribute, or -EAFNOSUPPORT for an unusable address family.
 */
static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
				    struct ip_vs_service_user_kern *usvc,
				    struct nlattr *nla, bool full_entry,
				    struct ip_vs_service **ret_svc)
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
	struct ip_vs_service *svc;

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

	memset(usvc, 0, sizeof(*usvc));

	usvc->af = nla_get_u16(nla_af);
	if (!ip_vs_is_af_valid(usvc->af))
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		/* a fwmark wins over protocol/address/port; the protocol
		 * is then fixed to TCP
		 */
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
		usvc->port = nla_get_be16(nla_port);
		usvc->fwmark = 0;
	}

	rcu_read_lock();
	if (usvc->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
					   &usvc->addr, usvc->port);
	rcu_read_unlock();
	*ret_svc = svc;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
		if (svc)
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
		/* both names point into the attribute payload, no copy */
		usvc->sched_name = nla_data(nla_sched);
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
		usvc->timeout = nla_get_u32(nla_timeout);
		usvc->netmask = nla_get_be32(nla_netmask);
	}

	return 0;
}
rcu_read_lock(); 3495 if (usvc->fwmark) 3496 svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); 3497 else 3498 svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, 3499 &usvc->addr, usvc->port); 3500 rcu_read_unlock(); 3501 *ret_svc = svc; 3502 3503 /* If a full entry was requested, check for the additional fields */ 3504 if (full_entry) { 3505 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, 3506 *nla_netmask; 3507 struct ip_vs_flags flags; 3508 3509 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; 3510 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; 3511 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; 3512 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; 3513 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; 3514 3515 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) 3516 return -EINVAL; 3517 3518 nla_memcpy(&flags, nla_flags, sizeof(flags)); 3519 3520 /* prefill flags from service if it already exists */ 3521 if (svc) 3522 usvc->flags = svc->flags; 3523 3524 /* set new flags from userland */ 3525 usvc->flags = (usvc->flags & ~flags.mask) | 3526 (flags.flags & flags.mask); 3527 usvc->sched_name = nla_data(nla_sched); 3528 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; 3529 usvc->timeout = nla_get_u32(nla_timeout); 3530 usvc->netmask = nla_get_be32(nla_netmask); 3531 } 3532 3533 return 0; 3534 } 3535 3536 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, 3537 struct nlattr *nla) 3538 { 3539 struct ip_vs_service_user_kern usvc; 3540 struct ip_vs_service *svc; 3541 int ret; 3542 3543 ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); 3544 return ret ? 
ERR_PTR(ret) : svc; 3545 } 3546 3547 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) 3548 { 3549 struct nlattr *nl_dest; 3550 struct ip_vs_kstats kstats; 3551 3552 nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); 3553 if (!nl_dest) 3554 return -EMSGSIZE; 3555 3556 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || 3557 nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || 3558 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, 3559 (atomic_read(&dest->conn_flags) & 3560 IP_VS_CONN_F_FWD_MASK)) || 3561 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, 3562 atomic_read(&dest->weight)) || 3563 nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, 3564 dest->tun_type) || 3565 nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, 3566 dest->tun_port) || 3567 nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, 3568 dest->tun_flags) || 3569 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || 3570 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || 3571 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, 3572 atomic_read(&dest->activeconns)) || 3573 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, 3574 atomic_read(&dest->inactconns)) || 3575 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, 3576 atomic_read(&dest->persistconns)) || 3577 nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) 3578 goto nla_put_failure; 3579 ip_vs_copy_stats(&kstats, &dest->stats); 3580 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) 3581 goto nla_put_failure; 3582 if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) 3583 goto nla_put_failure; 3584 3585 nla_nest_end(skb, nl_dest); 3586 3587 return 0; 3588 3589 nla_put_failure: 3590 nla_nest_cancel(skb, nl_dest); 3591 return -EMSGSIZE; 3592 } 3593 3594 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, 3595 struct netlink_callback *cb) 3596 { 3597 void *hdr; 3598 3599 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 3600 &ip_vs_genl_family, 
NLM_F_MULTI, 3601 IPVS_CMD_NEW_DEST); 3602 if (!hdr) 3603 return -EMSGSIZE; 3604 3605 if (ip_vs_genl_fill_dest(skb, dest) < 0) 3606 goto nla_put_failure; 3607 3608 genlmsg_end(skb, hdr); 3609 return 0; 3610 3611 nla_put_failure: 3612 genlmsg_cancel(skb, hdr); 3613 return -EMSGSIZE; 3614 } 3615 3616 static int ip_vs_genl_dump_dests(struct sk_buff *skb, 3617 struct netlink_callback *cb) 3618 { 3619 int idx = 0; 3620 int start = cb->args[0]; 3621 struct ip_vs_service *svc; 3622 struct ip_vs_dest *dest; 3623 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; 3624 struct net *net = sock_net(skb->sk); 3625 struct netns_ipvs *ipvs = net_ipvs(net); 3626 3627 mutex_lock(&__ip_vs_mutex); 3628 3629 /* Try to find the service for which to dump destinations */ 3630 if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) 3631 goto out_err; 3632 3633 3634 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); 3635 if (IS_ERR_OR_NULL(svc)) 3636 goto out_err; 3637 3638 /* Dump the destinations */ 3639 list_for_each_entry(dest, &svc->destinations, n_list) { 3640 if (++idx <= start) 3641 continue; 3642 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { 3643 idx--; 3644 goto nla_put_failure; 3645 } 3646 } 3647 3648 nla_put_failure: 3649 cb->args[0] = idx; 3650 3651 out_err: 3652 mutex_unlock(&__ip_vs_mutex); 3653 3654 return skb->len; 3655 } 3656 3657 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, 3658 struct nlattr *nla, bool full_entry) 3659 { 3660 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; 3661 struct nlattr *nla_addr, *nla_port; 3662 struct nlattr *nla_addr_family; 3663 3664 /* Parse mandatory identifying destination fields first */ 3665 if (nla == NULL || 3666 nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) 3667 return -EINVAL; 3668 3669 nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; 3670 nla_port = attrs[IPVS_DEST_ATTR_PORT]; 3671 nla_addr_family = 
attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; 3672 3673 if (!(nla_addr && nla_port)) 3674 return -EINVAL; 3675 3676 memset(udest, 0, sizeof(*udest)); 3677 3678 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); 3679 udest->port = nla_get_be16(nla_port); 3680 3681 if (nla_addr_family) 3682 udest->af = nla_get_u16(nla_addr_family); 3683 else 3684 udest->af = 0; 3685 3686 /* If a full entry was requested, check for the additional fields */ 3687 if (full_entry) { 3688 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, 3689 *nla_l_thresh, *nla_tun_type, *nla_tun_port, 3690 *nla_tun_flags; 3691 3692 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; 3693 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; 3694 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; 3695 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; 3696 nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; 3697 nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; 3698 nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; 3699 3700 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) 3701 return -EINVAL; 3702 3703 udest->conn_flags = nla_get_u32(nla_fwd) 3704 & IP_VS_CONN_F_FWD_MASK; 3705 udest->weight = nla_get_u32(nla_weight); 3706 udest->u_threshold = nla_get_u32(nla_u_thresh); 3707 udest->l_threshold = nla_get_u32(nla_l_thresh); 3708 3709 if (nla_tun_type) 3710 udest->tun_type = nla_get_u8(nla_tun_type); 3711 3712 if (nla_tun_port) 3713 udest->tun_port = nla_get_be16(nla_tun_port); 3714 3715 if (nla_tun_flags) 3716 udest->tun_flags = nla_get_u16(nla_tun_flags); 3717 } 3718 3719 return 0; 3720 } 3721 3722 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, 3723 struct ipvs_sync_daemon_cfg *c) 3724 { 3725 struct nlattr *nl_daemon; 3726 3727 nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); 3728 if (!nl_daemon) 3729 return -EMSGSIZE; 3730 3731 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || 3732 nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || 3733 nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, 
c->syncid) || 3734 nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || 3735 nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || 3736 nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) 3737 goto nla_put_failure; 3738 #ifdef CONFIG_IP_VS_IPV6 3739 if (c->mcast_af == AF_INET6) { 3740 if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, 3741 &c->mcast_group.in6)) 3742 goto nla_put_failure; 3743 } else 3744 #endif 3745 if (c->mcast_af == AF_INET && 3746 nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, 3747 c->mcast_group.ip)) 3748 goto nla_put_failure; 3749 nla_nest_end(skb, nl_daemon); 3750 3751 return 0; 3752 3753 nla_put_failure: 3754 nla_nest_cancel(skb, nl_daemon); 3755 return -EMSGSIZE; 3756 } 3757 3758 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, 3759 struct ipvs_sync_daemon_cfg *c, 3760 struct netlink_callback *cb) 3761 { 3762 void *hdr; 3763 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 3764 &ip_vs_genl_family, NLM_F_MULTI, 3765 IPVS_CMD_NEW_DAEMON); 3766 if (!hdr) 3767 return -EMSGSIZE; 3768 3769 if (ip_vs_genl_fill_daemon(skb, state, c)) 3770 goto nla_put_failure; 3771 3772 genlmsg_end(skb, hdr); 3773 return 0; 3774 3775 nla_put_failure: 3776 genlmsg_cancel(skb, hdr); 3777 return -EMSGSIZE; 3778 } 3779 3780 static int ip_vs_genl_dump_daemons(struct sk_buff *skb, 3781 struct netlink_callback *cb) 3782 { 3783 struct net *net = sock_net(skb->sk); 3784 struct netns_ipvs *ipvs = net_ipvs(net); 3785 3786 mutex_lock(&ipvs->sync_mutex); 3787 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { 3788 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, 3789 &ipvs->mcfg, cb) < 0) 3790 goto nla_put_failure; 3791 3792 cb->args[0] = 1; 3793 } 3794 3795 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { 3796 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, 3797 &ipvs->bcfg, cb) < 0) 3798 goto nla_put_failure; 3799 3800 cb->args[1] = 1; 3801 } 3802 3803 
nla_put_failure: 3804 mutex_unlock(&ipvs->sync_mutex); 3805 3806 return skb->len; 3807 } 3808 3809 static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) 3810 { 3811 struct ipvs_sync_daemon_cfg c; 3812 struct nlattr *a; 3813 int ret; 3814 3815 memset(&c, 0, sizeof(c)); 3816 if (!(attrs[IPVS_DAEMON_ATTR_STATE] && 3817 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && 3818 attrs[IPVS_DAEMON_ATTR_SYNC_ID])) 3819 return -EINVAL; 3820 strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), 3821 sizeof(c.mcast_ifn)); 3822 c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); 3823 3824 a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; 3825 if (a) 3826 c.sync_maxlen = nla_get_u16(a); 3827 3828 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; 3829 if (a) { 3830 c.mcast_af = AF_INET; 3831 c.mcast_group.ip = nla_get_in_addr(a); 3832 if (!ipv4_is_multicast(c.mcast_group.ip)) 3833 return -EINVAL; 3834 } else { 3835 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; 3836 if (a) { 3837 #ifdef CONFIG_IP_VS_IPV6 3838 int addr_type; 3839 3840 c.mcast_af = AF_INET6; 3841 c.mcast_group.in6 = nla_get_in6_addr(a); 3842 addr_type = ipv6_addr_type(&c.mcast_group.in6); 3843 if (!(addr_type & IPV6_ADDR_MULTICAST)) 3844 return -EINVAL; 3845 #else 3846 return -EAFNOSUPPORT; 3847 #endif 3848 } 3849 } 3850 3851 a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; 3852 if (a) 3853 c.mcast_port = nla_get_u16(a); 3854 3855 a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; 3856 if (a) 3857 c.mcast_ttl = nla_get_u8(a); 3858 3859 /* The synchronization protocol is incompatible with mixed family 3860 * services 3861 */ 3862 if (ipvs->mixed_address_family_dests > 0) 3863 return -EINVAL; 3864 3865 ret = start_sync_thread(ipvs, &c, 3866 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 3867 return ret; 3868 } 3869 3870 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) 3871 { 3872 int ret; 3873 3874 if (!attrs[IPVS_DAEMON_ATTR_STATE]) 3875 return -EINVAL; 3876 3877 ret = stop_sync_thread(ipvs, 3878 
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 3879 return ret; 3880 } 3881 3882 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) 3883 { 3884 struct ip_vs_timeout_user t; 3885 3886 __ip_vs_get_timeouts(ipvs, &t); 3887 3888 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) 3889 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); 3890 3891 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) 3892 t.tcp_fin_timeout = 3893 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); 3894 3895 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) 3896 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); 3897 3898 return ip_vs_set_timeout(ipvs, &t); 3899 } 3900 3901 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) 3902 { 3903 int ret = -EINVAL, cmd; 3904 struct net *net = sock_net(skb->sk); 3905 struct netns_ipvs *ipvs = net_ipvs(net); 3906 3907 cmd = info->genlhdr->cmd; 3908 3909 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { 3910 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; 3911 3912 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || 3913 nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) 3914 goto out; 3915 3916 if (cmd == IPVS_CMD_NEW_DAEMON) 3917 ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); 3918 else 3919 ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); 3920 } 3921 3922 out: 3923 return ret; 3924 } 3925 3926 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) 3927 { 3928 bool need_full_svc = false, need_full_dest = false; 3929 struct ip_vs_service *svc = NULL; 3930 struct ip_vs_service_user_kern usvc; 3931 struct ip_vs_dest_user_kern udest; 3932 int ret = 0, cmd; 3933 struct net *net = sock_net(skb->sk); 3934 struct netns_ipvs *ipvs = net_ipvs(net); 3935 3936 cmd = info->genlhdr->cmd; 3937 3938 mutex_lock(&__ip_vs_mutex); 3939 3940 if (cmd == IPVS_CMD_FLUSH) { 3941 ret = ip_vs_flush(ipvs, false); 3942 goto 
out; 3943 } else if (cmd == IPVS_CMD_SET_CONFIG) { 3944 ret = ip_vs_genl_set_config(ipvs, info->attrs); 3945 goto out; 3946 } else if (cmd == IPVS_CMD_ZERO && 3947 !info->attrs[IPVS_CMD_ATTR_SERVICE]) { 3948 ret = ip_vs_zero_all(ipvs); 3949 goto out; 3950 } 3951 3952 /* All following commands require a service argument, so check if we 3953 * received a valid one. We need a full service specification when 3954 * adding / editing a service. Only identifying members otherwise. */ 3955 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) 3956 need_full_svc = true; 3957 3958 ret = ip_vs_genl_parse_service(ipvs, &usvc, 3959 info->attrs[IPVS_CMD_ATTR_SERVICE], 3960 need_full_svc, &svc); 3961 if (ret) 3962 goto out; 3963 3964 /* Unless we're adding a new service, the service must already exist */ 3965 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { 3966 ret = -ESRCH; 3967 goto out; 3968 } 3969 3970 /* Destination commands require a valid destination argument. For 3971 * adding / editing a destination, we need a full destination 3972 * specification. */ 3973 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || 3974 cmd == IPVS_CMD_DEL_DEST) { 3975 if (cmd != IPVS_CMD_DEL_DEST) 3976 need_full_dest = true; 3977 3978 ret = ip_vs_genl_parse_dest(&udest, 3979 info->attrs[IPVS_CMD_ATTR_DEST], 3980 need_full_dest); 3981 if (ret) 3982 goto out; 3983 3984 /* Old protocols did not allow the user to specify address 3985 * family, so we set it to zero instead. We also didn't 3986 * allow heterogeneous pools in the old code, so it's safe 3987 * to assume that this will have the same address family as 3988 * the service. 
3989 */ 3990 if (udest.af == 0) 3991 udest.af = svc->af; 3992 3993 if (!ip_vs_is_af_valid(udest.af)) { 3994 ret = -EAFNOSUPPORT; 3995 goto out; 3996 } 3997 3998 if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { 3999 /* The synchronization protocol is incompatible 4000 * with mixed family services 4001 */ 4002 if (ipvs->sync_state) { 4003 ret = -EINVAL; 4004 goto out; 4005 } 4006 4007 /* Which connection types do we support? */ 4008 switch (udest.conn_flags) { 4009 case IP_VS_CONN_F_TUNNEL: 4010 /* We are able to forward this */ 4011 break; 4012 default: 4013 ret = -EINVAL; 4014 goto out; 4015 } 4016 } 4017 } 4018 4019 switch (cmd) { 4020 case IPVS_CMD_NEW_SERVICE: 4021 if (svc == NULL) 4022 ret = ip_vs_add_service(ipvs, &usvc, &svc); 4023 else 4024 ret = -EEXIST; 4025 break; 4026 case IPVS_CMD_SET_SERVICE: 4027 ret = ip_vs_edit_service(svc, &usvc); 4028 break; 4029 case IPVS_CMD_DEL_SERVICE: 4030 ret = ip_vs_del_service(svc); 4031 /* do not use svc, it can be freed */ 4032 break; 4033 case IPVS_CMD_NEW_DEST: 4034 ret = ip_vs_add_dest(svc, &udest); 4035 break; 4036 case IPVS_CMD_SET_DEST: 4037 ret = ip_vs_edit_dest(svc, &udest); 4038 break; 4039 case IPVS_CMD_DEL_DEST: 4040 ret = ip_vs_del_dest(svc, &udest); 4041 break; 4042 case IPVS_CMD_ZERO: 4043 ret = ip_vs_zero_service(svc); 4044 break; 4045 default: 4046 ret = -EINVAL; 4047 } 4048 4049 out: 4050 mutex_unlock(&__ip_vs_mutex); 4051 4052 return ret; 4053 } 4054 4055 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) 4056 { 4057 struct sk_buff *msg; 4058 void *reply; 4059 int ret, cmd, reply_cmd; 4060 struct net *net = sock_net(skb->sk); 4061 struct netns_ipvs *ipvs = net_ipvs(net); 4062 4063 cmd = info->genlhdr->cmd; 4064 4065 if (cmd == IPVS_CMD_GET_SERVICE) 4066 reply_cmd = IPVS_CMD_NEW_SERVICE; 4067 else if (cmd == IPVS_CMD_GET_INFO) 4068 reply_cmd = IPVS_CMD_SET_INFO; 4069 else if (cmd == IPVS_CMD_GET_CONFIG) 4070 reply_cmd = IPVS_CMD_SET_CONFIG; 4071 else { 4072 pr_err("unknown 
Generic Netlink command\n"); 4073 return -EINVAL; 4074 } 4075 4076 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 4077 if (!msg) 4078 return -ENOMEM; 4079 4080 mutex_lock(&__ip_vs_mutex); 4081 4082 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); 4083 if (reply == NULL) 4084 goto nla_put_failure; 4085 4086 switch (cmd) { 4087 case IPVS_CMD_GET_SERVICE: 4088 { 4089 struct ip_vs_service *svc; 4090 4091 svc = ip_vs_genl_find_service(ipvs, 4092 info->attrs[IPVS_CMD_ATTR_SERVICE]); 4093 if (IS_ERR(svc)) { 4094 ret = PTR_ERR(svc); 4095 goto out_err; 4096 } else if (svc) { 4097 ret = ip_vs_genl_fill_service(msg, svc); 4098 if (ret) 4099 goto nla_put_failure; 4100 } else { 4101 ret = -ESRCH; 4102 goto out_err; 4103 } 4104 4105 break; 4106 } 4107 4108 case IPVS_CMD_GET_CONFIG: 4109 { 4110 struct ip_vs_timeout_user t; 4111 4112 __ip_vs_get_timeouts(ipvs, &t); 4113 #ifdef CONFIG_IP_VS_PROTO_TCP 4114 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, 4115 t.tcp_timeout) || 4116 nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, 4117 t.tcp_fin_timeout)) 4118 goto nla_put_failure; 4119 #endif 4120 #ifdef CONFIG_IP_VS_PROTO_UDP 4121 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) 4122 goto nla_put_failure; 4123 #endif 4124 4125 break; 4126 } 4127 4128 case IPVS_CMD_GET_INFO: 4129 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, 4130 IP_VS_VERSION_CODE) || 4131 nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, 4132 ip_vs_conn_tab_size)) 4133 goto nla_put_failure; 4134 break; 4135 } 4136 4137 genlmsg_end(msg, reply); 4138 ret = genlmsg_reply(msg, info); 4139 goto out; 4140 4141 nla_put_failure: 4142 pr_err("not enough space in Netlink message\n"); 4143 ret = -EMSGSIZE; 4144 4145 out_err: 4146 nlmsg_free(msg); 4147 out: 4148 mutex_unlock(&__ip_vs_mutex); 4149 4150 return ret; 4151 } 4152 4153 4154 static const struct genl_small_ops ip_vs_genl_ops[] = { 4155 { 4156 .cmd = IPVS_CMD_NEW_SERVICE, 4157 .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, 4158 .flags = GENL_ADMIN_PERM, 4159 .doit = ip_vs_genl_set_cmd, 4160 }, 4161 { 4162 .cmd = IPVS_CMD_SET_SERVICE, 4163 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4164 .flags = GENL_ADMIN_PERM, 4165 .doit = ip_vs_genl_set_cmd, 4166 }, 4167 { 4168 .cmd = IPVS_CMD_DEL_SERVICE, 4169 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4170 .flags = GENL_ADMIN_PERM, 4171 .doit = ip_vs_genl_set_cmd, 4172 }, 4173 { 4174 .cmd = IPVS_CMD_GET_SERVICE, 4175 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4176 .flags = GENL_ADMIN_PERM, 4177 .doit = ip_vs_genl_get_cmd, 4178 .dumpit = ip_vs_genl_dump_services, 4179 }, 4180 { 4181 .cmd = IPVS_CMD_NEW_DEST, 4182 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4183 .flags = GENL_ADMIN_PERM, 4184 .doit = ip_vs_genl_set_cmd, 4185 }, 4186 { 4187 .cmd = IPVS_CMD_SET_DEST, 4188 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4189 .flags = GENL_ADMIN_PERM, 4190 .doit = ip_vs_genl_set_cmd, 4191 }, 4192 { 4193 .cmd = IPVS_CMD_DEL_DEST, 4194 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4195 .flags = GENL_ADMIN_PERM, 4196 .doit = ip_vs_genl_set_cmd, 4197 }, 4198 { 4199 .cmd = IPVS_CMD_GET_DEST, 4200 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4201 .flags = GENL_ADMIN_PERM, 4202 .dumpit = ip_vs_genl_dump_dests, 4203 }, 4204 { 4205 .cmd = IPVS_CMD_NEW_DAEMON, 4206 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4207 .flags = GENL_ADMIN_PERM, 4208 .doit = ip_vs_genl_set_daemon, 4209 }, 4210 { 4211 .cmd = IPVS_CMD_DEL_DAEMON, 4212 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4213 .flags = GENL_ADMIN_PERM, 4214 .doit = ip_vs_genl_set_daemon, 4215 }, 4216 { 4217 .cmd = IPVS_CMD_GET_DAEMON, 4218 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4219 .flags = GENL_ADMIN_PERM, 4220 .dumpit = ip_vs_genl_dump_daemons, 4221 }, 4222 { 4223 .cmd = 
IPVS_CMD_SET_CONFIG, 4224 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4225 .flags = GENL_ADMIN_PERM, 4226 .doit = ip_vs_genl_set_cmd, 4227 }, 4228 { 4229 .cmd = IPVS_CMD_GET_CONFIG, 4230 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4231 .flags = GENL_ADMIN_PERM, 4232 .doit = ip_vs_genl_get_cmd, 4233 }, 4234 { 4235 .cmd = IPVS_CMD_GET_INFO, 4236 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4237 .flags = GENL_ADMIN_PERM, 4238 .doit = ip_vs_genl_get_cmd, 4239 }, 4240 { 4241 .cmd = IPVS_CMD_ZERO, 4242 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4243 .flags = GENL_ADMIN_PERM, 4244 .doit = ip_vs_genl_set_cmd, 4245 }, 4246 { 4247 .cmd = IPVS_CMD_FLUSH, 4248 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 4249 .flags = GENL_ADMIN_PERM, 4250 .doit = ip_vs_genl_set_cmd, 4251 }, 4252 }; 4253 4254 static struct genl_family ip_vs_genl_family __ro_after_init = { 4255 .hdrsize = 0, 4256 .name = IPVS_GENL_NAME, 4257 .version = IPVS_GENL_VERSION, 4258 .maxattr = IPVS_CMD_ATTR_MAX, 4259 .policy = ip_vs_cmd_policy, 4260 .netnsok = true, /* Make ipvsadm to work on netns */ 4261 .module = THIS_MODULE, 4262 .small_ops = ip_vs_genl_ops, 4263 .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), 4264 .resv_start_op = IPVS_CMD_FLUSH + 1, 4265 }; 4266 4267 static int __init ip_vs_genl_register(void) 4268 { 4269 return genl_register_family(&ip_vs_genl_family); 4270 } 4271 4272 static void ip_vs_genl_unregister(void) 4273 { 4274 genl_unregister_family(&ip_vs_genl_family); 4275 } 4276 4277 /* End of Generic Netlink interface definitions */ 4278 4279 /* 4280 * per netns intit/exit func. 
4281 */ 4282 #ifdef CONFIG_SYSCTL 4283 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) 4284 { 4285 struct net *net = ipvs->net; 4286 struct ctl_table *tbl; 4287 int idx, ret; 4288 4289 atomic_set(&ipvs->dropentry, 0); 4290 spin_lock_init(&ipvs->dropentry_lock); 4291 spin_lock_init(&ipvs->droppacket_lock); 4292 spin_lock_init(&ipvs->securetcp_lock); 4293 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); 4294 INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, 4295 expire_nodest_conn_handler); 4296 ipvs->est_stopped = 0; 4297 4298 if (!net_eq(net, &init_net)) { 4299 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); 4300 if (tbl == NULL) 4301 return -ENOMEM; 4302 4303 /* Don't export sysctls to unprivileged users */ 4304 if (net->user_ns != &init_user_ns) 4305 tbl[0].procname = NULL; 4306 } else 4307 tbl = vs_vars; 4308 /* Initialize sysctl defaults */ 4309 for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { 4310 if (tbl[idx].proc_handler == proc_do_defense_mode) 4311 tbl[idx].extra2 = ipvs; 4312 } 4313 idx = 0; 4314 ipvs->sysctl_amemthresh = 1024; 4315 tbl[idx++].data = &ipvs->sysctl_amemthresh; 4316 ipvs->sysctl_am_droprate = 10; 4317 tbl[idx++].data = &ipvs->sysctl_am_droprate; 4318 tbl[idx++].data = &ipvs->sysctl_drop_entry; 4319 tbl[idx++].data = &ipvs->sysctl_drop_packet; 4320 #ifdef CONFIG_IP_VS_NFCT 4321 tbl[idx++].data = &ipvs->sysctl_conntrack; 4322 #endif 4323 tbl[idx++].data = &ipvs->sysctl_secure_tcp; 4324 ipvs->sysctl_snat_reroute = 1; 4325 tbl[idx++].data = &ipvs->sysctl_snat_reroute; 4326 ipvs->sysctl_sync_ver = 1; 4327 tbl[idx++].data = &ipvs->sysctl_sync_ver; 4328 ipvs->sysctl_sync_ports = 1; 4329 tbl[idx++].data = &ipvs->sysctl_sync_ports; 4330 tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; 4331 ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; 4332 tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; 4333 ipvs->sysctl_sync_sock_size = 0; 4334 tbl[idx++].data = &ipvs->sysctl_sync_sock_size; 4335 
tbl[idx++].data = &ipvs->sysctl_cache_bypass; 4336 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; 4337 tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; 4338 tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; 4339 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; 4340 ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; 4341 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; 4342 tbl[idx].data = &ipvs->sysctl_sync_threshold; 4343 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); 4344 ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; 4345 tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; 4346 ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); 4347 tbl[idx++].data = &ipvs->sysctl_sync_retries; 4348 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; 4349 ipvs->sysctl_pmtu_disc = 1; 4350 tbl[idx++].data = &ipvs->sysctl_pmtu_disc; 4351 tbl[idx++].data = &ipvs->sysctl_backup_only; 4352 ipvs->sysctl_conn_reuse_mode = 1; 4353 tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; 4354 tbl[idx++].data = &ipvs->sysctl_schedule_icmp; 4355 tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; 4356 ipvs->sysctl_run_estimation = 1; 4357 tbl[idx].extra2 = ipvs; 4358 tbl[idx++].data = &ipvs->sysctl_run_estimation; 4359 4360 ipvs->est_cpulist_valid = 0; 4361 tbl[idx].extra2 = ipvs; 4362 tbl[idx++].data = &ipvs->sysctl_est_cpulist; 4363 4364 ipvs->sysctl_est_nice = IPVS_EST_NICE; 4365 tbl[idx].extra2 = ipvs; 4366 tbl[idx++].data = &ipvs->sysctl_est_nice; 4367 4368 #ifdef CONFIG_IP_VS_DEBUG 4369 /* Global sysctls must be ro in non-init netns */ 4370 if (!net_eq(net, &init_net)) 4371 tbl[idx++].mode = 0444; 4372 #endif 4373 4374 ret = -ENOMEM; 4375 ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); 4376 if (!ipvs->sysctl_hdr) 4377 goto err; 4378 ipvs->sysctl_tbl = tbl; 4379 4380 ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); 4381 if (ret < 0) 4382 goto err; 4383 4384 /* Schedule defense work */ 4385 
queue_delayed_work(system_long_wq, &ipvs->defense_work, 4386 DEFENSE_TIMER_PERIOD); 4387 4388 return 0; 4389 4390 err: 4391 unregister_net_sysctl_table(ipvs->sysctl_hdr); 4392 if (!net_eq(net, &init_net)) 4393 kfree(tbl); 4394 return ret; 4395 } 4396 4397 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) 4398 { 4399 struct net *net = ipvs->net; 4400 4401 cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); 4402 cancel_delayed_work_sync(&ipvs->defense_work); 4403 cancel_work_sync(&ipvs->defense_work.work); 4404 unregister_net_sysctl_table(ipvs->sysctl_hdr); 4405 ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); 4406 4407 if (ipvs->est_cpulist_valid) 4408 free_cpumask_var(ipvs->sysctl_est_cpulist); 4409 4410 if (!net_eq(net, &init_net)) 4411 kfree(ipvs->sysctl_tbl); 4412 } 4413 4414 #else 4415 4416 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } 4417 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } 4418 4419 #endif 4420 4421 static struct notifier_block ip_vs_dst_notifier = { 4422 .notifier_call = ip_vs_dst_event, 4423 #ifdef CONFIG_IP_VS_IPV6 4424 .priority = ADDRCONF_NOTIFY_PRIORITY + 5, 4425 #endif 4426 }; 4427 4428 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) 4429 { 4430 int ret = -ENOMEM; 4431 int idx; 4432 4433 /* Initialize rs_table */ 4434 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) 4435 INIT_HLIST_HEAD(&ipvs->rs_table[idx]); 4436 4437 INIT_LIST_HEAD(&ipvs->dest_trash); 4438 spin_lock_init(&ipvs->dest_trash_lock); 4439 timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); 4440 atomic_set(&ipvs->ftpsvc_counter, 0); 4441 atomic_set(&ipvs->nullsvc_counter, 0); 4442 atomic_set(&ipvs->conn_out_counter, 0); 4443 4444 INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); 4445 4446 /* procfs stats */ 4447 ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); 4448 if (!ipvs->tot_stats) 4449 goto out; 
4450 if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) 4451 goto err_tot_stats; 4452 4453 #ifdef CONFIG_PROC_FS 4454 if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, 4455 &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) 4456 goto err_vs; 4457 if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, 4458 ip_vs_stats_show, NULL)) 4459 goto err_stats; 4460 if (!proc_create_net_single("ip_vs_stats_percpu", 0, 4461 ipvs->net->proc_net, 4462 ip_vs_stats_percpu_show, NULL)) 4463 goto err_percpu; 4464 #endif 4465 4466 ret = ip_vs_control_net_init_sysctl(ipvs); 4467 if (ret < 0) 4468 goto err; 4469 4470 return 0; 4471 4472 err: 4473 #ifdef CONFIG_PROC_FS 4474 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); 4475 4476 err_percpu: 4477 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); 4478 4479 err_stats: 4480 remove_proc_entry("ip_vs", ipvs->net->proc_net); 4481 4482 err_vs: 4483 #endif 4484 ip_vs_stats_release(&ipvs->tot_stats->s); 4485 4486 err_tot_stats: 4487 kfree(ipvs->tot_stats); 4488 4489 out: 4490 return ret; 4491 } 4492 4493 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) 4494 { 4495 ip_vs_trash_cleanup(ipvs); 4496 ip_vs_control_net_cleanup_sysctl(ipvs); 4497 cancel_delayed_work_sync(&ipvs->est_reload_work); 4498 #ifdef CONFIG_PROC_FS 4499 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); 4500 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); 4501 remove_proc_entry("ip_vs", ipvs->net->proc_net); 4502 #endif 4503 call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); 4504 } 4505 4506 int __init ip_vs_register_nl_ioctl(void) 4507 { 4508 int ret; 4509 4510 ret = nf_register_sockopt(&ip_vs_sockopts); 4511 if (ret) { 4512 pr_err("cannot register sockopt.\n"); 4513 goto err_sock; 4514 } 4515 4516 ret = ip_vs_genl_register(); 4517 if (ret) { 4518 pr_err("cannot register Generic Netlink interface.\n"); 4519 goto err_genl; 4520 } 4521 return 0; 4522 4523 err_genl: 4524 
nf_unregister_sockopt(&ip_vs_sockopts); 4525 err_sock: 4526 return ret; 4527 } 4528 4529 void ip_vs_unregister_nl_ioctl(void) 4530 { 4531 ip_vs_genl_unregister(); 4532 nf_unregister_sockopt(&ip_vs_sockopts); 4533 } 4534 4535 int __init ip_vs_control_init(void) 4536 { 4537 int idx; 4538 int ret; 4539 4540 EnterFunction(2); 4541 4542 /* Initialize svc_table, ip_vs_svc_fwm_table */ 4543 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 4544 INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); 4545 INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); 4546 } 4547 4548 smp_wmb(); /* Do we really need it now ? */ 4549 4550 ret = register_netdevice_notifier(&ip_vs_dst_notifier); 4551 if (ret < 0) 4552 return ret; 4553 4554 LeaveFunction(2); 4555 return 0; 4556 } 4557 4558 4559 void ip_vs_control_cleanup(void) 4560 { 4561 EnterFunction(2); 4562 unregister_netdevice_notifier(&ip_vs_dst_notifier); 4563 /* relying on common rcu_barrier() in ip_vs_cleanup() */ 4564 LeaveFunction(2); 4565 } 4566