1 /* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 20 * 21 * Changes: 22 * 23 */ 24 25 #define KMSG_COMPONENT "IPVS" 26 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 27 28 #include <linux/interrupt.h> 29 #include <linux/in.h> 30 #include <linux/inet.h> 31 #include <linux/net.h> 32 #include <linux/kernel.h> 33 #include <linux/module.h> 34 #include <linux/vmalloc.h> 35 #include <linux/proc_fs.h> /* for proc_net_* */ 36 #include <linux/slab.h> 37 #include <linux/seq_file.h> 38 #include <linux/jhash.h> 39 #include <linux/random.h> 40 41 #include <net/net_namespace.h> 42 #include <net/ip_vs.h> 43 44 45 #ifndef CONFIG_IP_VS_TAB_BITS 46 #define CONFIG_IP_VS_TAB_BITS 12 47 #endif 48 49 /* 50 * Connection hash size. Default is what was selected at compile time. 51 */ 52 static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; 53 module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); 54 MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); 55 56 /* size and mask values */ 57 int ip_vs_conn_tab_size __read_mostly; 58 static int ip_vs_conn_tab_mask __read_mostly; 59 60 /* 61 * Connection hash table: for input and output packets lookups of IPVS 62 */ 63 static struct hlist_head *ip_vs_conn_tab __read_mostly; 64 65 /* SLAB cache for IPVS connections */ 66 static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 67 68 /* counter for no client port connections */ 69 static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 70 71 /* random value for IPVS connection hash */ 72 static unsigned int ip_vs_conn_rnd __read_mostly; 73 74 /* 75 * Fine locking granularity for big connection hash table 76 */ 77 #define CT_LOCKARRAY_BITS 5 78 #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 79 #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 80 81 /* We need an addrstrlen that works with or without v6 */ 82 #ifdef CONFIG_IP_VS_IPV6 83 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN 84 #else 85 #define IP_VS_ADDRSTRLEN (8+1) 86 #endif 87 88 struct ip_vs_aligned_lock 89 { 90 spinlock_t l; 91 } __attribute__((__aligned__(SMP_CACHE_BYTES))); 92 93 /* lock array for conn table */ 94 static struct ip_vs_aligned_lock 95 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 96 97 static inline void ct_write_lock_bh(unsigned int key) 98 { 99 spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 100 } 101 102 static inline void ct_write_unlock_bh(unsigned int key) 103 { 104 spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 105 } 106 107 static void ip_vs_conn_expire(unsigned long data); 108 109 /* 110 * Returns hash value for IPVS connection entry 111 */ 112 static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, 113 const union nf_inet_addr *addr, 114 __be16 port) 115 { 116 #ifdef CONFIG_IP_VS_IPV6 117 if (af == AF_INET6) 118 return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 119 (__force u32)port, proto, ip_vs_conn_rnd) ^ 120 ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; 121 #endif 122 return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 123 ip_vs_conn_rnd) ^ 124 ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; 125 } 126 127 static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, 128 bool inverse) 129 { 130 const union nf_inet_addr *addr; 131 __be16 port; 132 133 if (p->pe_data && p->pe->hashkey_raw) 134 return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & 135 ip_vs_conn_tab_mask; 136 137 if (likely(!inverse)) { 138 addr = p->caddr; 139 port = p->cport; 140 } else { 141 addr = p->vaddr; 142 port = p->vport; 143 } 144 145 return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); 146 } 147 148 static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) 149 { 150 struct ip_vs_conn_param p; 151 152 ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, 153 &cp->caddr, cp->cport, NULL, 0, &p); 154 155 if (cp->pe) { 156 p.pe = cp->pe; 157 p.pe_data = cp->pe_data; 158 p.pe_data_len = cp->pe_data_len; 159 } 160 161 return ip_vs_conn_hashkey_param(&p, false); 162 } 163 164 /* 165 * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. 166 * returns bool success. 167 */ 168 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 169 { 170 unsigned int hash; 171 int ret; 172 173 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 174 return 0; 175 176 /* Hash by protocol, client address and port */ 177 hash = ip_vs_conn_hashkey_conn(cp); 178 179 ct_write_lock_bh(hash); 180 spin_lock(&cp->lock); 181 182 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 183 cp->flags |= IP_VS_CONN_F_HASHED; 184 atomic_inc(&cp->refcnt); 185 hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); 186 ret = 1; 187 } else { 188 pr_err("%s(): request for already hashed, called from %pF\n", 189 __func__, __builtin_return_address(0)); 190 ret = 0; 191 } 192 193 spin_unlock(&cp->lock); 194 ct_write_unlock_bh(hash); 195 196 return ret; 197 } 198 199 200 /* 201 * UNhashes ip_vs_conn from ip_vs_conn_tab. 202 * returns bool success. Caller should hold conn reference. 203 */ 204 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 205 { 206 unsigned int hash; 207 int ret; 208 209 /* unhash it and decrease its reference counter */ 210 hash = ip_vs_conn_hashkey_conn(cp); 211 212 ct_write_lock_bh(hash); 213 spin_lock(&cp->lock); 214 215 if (cp->flags & IP_VS_CONN_F_HASHED) { 216 hlist_del_rcu(&cp->c_list); 217 cp->flags &= ~IP_VS_CONN_F_HASHED; 218 atomic_dec(&cp->refcnt); 219 ret = 1; 220 } else 221 ret = 0; 222 223 spin_unlock(&cp->lock); 224 ct_write_unlock_bh(hash); 225 226 return ret; 227 } 228 229 /* Try to unlink ip_vs_conn from ip_vs_conn_tab. 230 * returns bool success. 231 */ 232 static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) 233 { 234 unsigned int hash; 235 bool ret; 236 237 hash = ip_vs_conn_hashkey_conn(cp); 238 239 ct_write_lock_bh(hash); 240 spin_lock(&cp->lock); 241 242 if (cp->flags & IP_VS_CONN_F_HASHED) { 243 ret = false; 244 /* Decrease refcnt and unlink conn only if we are last user */ 245 if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { 246 hlist_del_rcu(&cp->c_list); 247 cp->flags &= ~IP_VS_CONN_F_HASHED; 248 ret = true; 249 } 250 } else 251 ret = atomic_read(&cp->refcnt) ? false : true; 252 253 spin_unlock(&cp->lock); 254 ct_write_unlock_bh(hash); 255 256 return ret; 257 } 258 259 260 /* 261 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 262 * Called for pkts coming from OUTside-to-INside. 263 * p->caddr, p->cport: pkt source address (foreign host) 264 * p->vaddr, p->vport: pkt dest address (load balancer) 265 */ 266 static inline struct ip_vs_conn * 267 __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 268 { 269 unsigned int hash; 270 struct ip_vs_conn *cp; 271 272 hash = ip_vs_conn_hashkey_param(p, false); 273 274 rcu_read_lock(); 275 276 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 277 if (p->cport == cp->cport && p->vport == cp->vport && 278 cp->af == p->af && 279 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 280 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && 281 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 282 p->protocol == cp->protocol && 283 cp->ipvs == p->ipvs) { 284 if (!__ip_vs_conn_get(cp)) 285 continue; 286 /* HIT */ 287 rcu_read_unlock(); 288 return cp; 289 } 290 } 291 292 rcu_read_unlock(); 293 294 return NULL; 295 } 296 297 struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 298 { 299 struct ip_vs_conn *cp; 300 301 cp = __ip_vs_conn_in_get(p); 302 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { 303 struct ip_vs_conn_param cport_zero_p = *p; 304 cport_zero_p.cport = 0; 305 cp = __ip_vs_conn_in_get(&cport_zero_p); 306 } 307 308 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", 309 ip_vs_proto_name(p->protocol), 310 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 311 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 312 cp ? "hit" : "not hit"); 313 314 return cp; 315 } 316 317 static int 318 ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, 319 int af, const struct sk_buff *skb, 320 const struct ip_vs_iphdr *iph, 321 struct ip_vs_conn_param *p) 322 { 323 __be16 _ports[2], *pptr; 324 325 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); 326 if (pptr == NULL) 327 return 1; 328 329 if (likely(!ip_vs_iph_inverse(iph))) 330 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr, 331 pptr[0], &iph->daddr, pptr[1], p); 332 else 333 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr, 334 pptr[1], &iph->saddr, pptr[0], p); 335 return 0; 336 } 337 338 struct ip_vs_conn * 339 ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, 340 const struct sk_buff *skb, 341 const struct ip_vs_iphdr *iph) 342 { 343 struct ip_vs_conn_param p; 344 345 if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) 346 return NULL; 347 348 return ip_vs_conn_in_get(&p); 349 } 350 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); 351 352 /* Get reference to connection template */ 353 struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) 354 { 355 unsigned int hash; 356 struct ip_vs_conn *cp; 357 358 hash = ip_vs_conn_hashkey_param(p, false); 359 360 rcu_read_lock(); 361 362 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 363 if (unlikely(p->pe_data && p->pe->ct_match)) { 364 if (cp->ipvs != p->ipvs) 365 continue; 366 if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { 367 if (__ip_vs_conn_get(cp)) 368 goto out; 369 } 370 continue; 371 } 372 373 if (cp->af == p->af && 374 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 375 /* protocol should only be IPPROTO_IP if 376 * p->vaddr is a fwmark */ 377 ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : 378 p->af, p->vaddr, &cp->vaddr) && 379 p->vport == cp->vport && p->cport == cp->cport && 380 cp->flags & IP_VS_CONN_F_TEMPLATE && 381 p->protocol == cp->protocol && 382 cp->ipvs == p->ipvs) { 383 if (__ip_vs_conn_get(cp)) 384 goto out; 385 } 386 } 387 cp = NULL; 388 389 out: 390 rcu_read_unlock(); 391 392 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", 393 ip_vs_proto_name(p->protocol), 394 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 395 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 396 cp ? "hit" : "not hit"); 397 398 return cp; 399 } 400 401 /* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 402 * Called for pkts coming from inside-to-OUTside. 403 * p->caddr, p->cport: pkt source address (inside host) 404 * p->vaddr, p->vport: pkt dest address (foreign host) */ 405 struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) 406 { 407 unsigned int hash; 408 struct ip_vs_conn *cp, *ret=NULL; 409 410 /* 411 * Check for "full" addressed entries 412 */ 413 hash = ip_vs_conn_hashkey_param(p, true); 414 415 rcu_read_lock(); 416 417 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 418 if (p->vport == cp->cport && p->cport == cp->dport && 419 cp->af == p->af && 420 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && 421 ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && 422 p->protocol == cp->protocol && 423 cp->ipvs == p->ipvs) { 424 if (!__ip_vs_conn_get(cp)) 425 continue; 426 /* HIT */ 427 ret = cp; 428 break; 429 } 430 } 431 432 rcu_read_unlock(); 433 434 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", 435 ip_vs_proto_name(p->protocol), 436 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 437 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 438 ret ? "hit" : "not hit"); 439 440 return ret; 441 } 442 443 struct ip_vs_conn * 444 ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, 445 const struct sk_buff *skb, 446 const struct ip_vs_iphdr *iph) 447 { 448 struct ip_vs_conn_param p; 449 450 if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) 451 return NULL; 452 453 return ip_vs_conn_out_get(&p); 454 } 455 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); 456 457 static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) 458 { 459 __ip_vs_conn_put(cp); 460 ip_vs_conn_expire((unsigned long)cp); 461 } 462 463 /* 464 * Put back the conn and restart its timer with its timeout 465 */ 466 static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) 467 { 468 unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 469 0 : cp->timeout; 470 mod_timer(&cp->timer, jiffies+t); 471 472 __ip_vs_conn_put(cp); 473 } 474 475 void ip_vs_conn_put(struct ip_vs_conn *cp) 476 { 477 if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && 478 (atomic_read(&cp->refcnt) == 1) && 479 !timer_pending(&cp->timer)) 480 /* expire connection immediately */ 481 __ip_vs_conn_put_notimer(cp); 482 else 483 __ip_vs_conn_put_timer(cp); 484 } 485 486 /* 487 * Fill a no_client_port connection with a client port number 488 */ 489 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 490 { 491 if (ip_vs_conn_unhash(cp)) { 492 spin_lock_bh(&cp->lock); 493 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 494 atomic_dec(&ip_vs_conn_no_cport_cnt); 495 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 496 cp->cport = cport; 497 } 498 spin_unlock_bh(&cp->lock); 499 500 /* hash on new dport */ 501 ip_vs_conn_hash(cp); 502 } 503 } 504 505 506 /* 507 * Bind a connection entry with the corresponding packet_xmit. 508 * Called by ip_vs_conn_new. 509 */ 510 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 511 { 512 switch (IP_VS_FWD_METHOD(cp)) { 513 case IP_VS_CONN_F_MASQ: 514 cp->packet_xmit = ip_vs_nat_xmit; 515 break; 516 517 case IP_VS_CONN_F_TUNNEL: 518 #ifdef CONFIG_IP_VS_IPV6 519 if (cp->daf == AF_INET6) 520 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 521 else 522 #endif 523 cp->packet_xmit = ip_vs_tunnel_xmit; 524 break; 525 526 case IP_VS_CONN_F_DROUTE: 527 cp->packet_xmit = ip_vs_dr_xmit; 528 break; 529 530 case IP_VS_CONN_F_LOCALNODE: 531 cp->packet_xmit = ip_vs_null_xmit; 532 break; 533 534 case IP_VS_CONN_F_BYPASS: 535 cp->packet_xmit = ip_vs_bypass_xmit; 536 break; 537 } 538 } 539 540 #ifdef CONFIG_IP_VS_IPV6 541 static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) 542 { 543 switch (IP_VS_FWD_METHOD(cp)) { 544 case IP_VS_CONN_F_MASQ: 545 cp->packet_xmit = ip_vs_nat_xmit_v6; 546 break; 547 548 case IP_VS_CONN_F_TUNNEL: 549 if (cp->daf == AF_INET6) 550 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 551 else 552 cp->packet_xmit = ip_vs_tunnel_xmit; 553 break; 554 555 case IP_VS_CONN_F_DROUTE: 556 cp->packet_xmit = ip_vs_dr_xmit_v6; 557 break; 558 559 case IP_VS_CONN_F_LOCALNODE: 560 cp->packet_xmit = ip_vs_null_xmit; 561 break; 562 563 case IP_VS_CONN_F_BYPASS: 564 cp->packet_xmit = ip_vs_bypass_xmit_v6; 565 break; 566 } 567 } 568 #endif 569 570 571 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 572 { 573 return atomic_read(&dest->activeconns) 574 + atomic_read(&dest->inactconns); 575 } 576 577 /* 578 * Bind a connection entry with a virtual service destination 579 * Called just after a new connection entry is created. 580 */ 581 static inline void 582 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 583 { 584 unsigned int conn_flags; 585 __u32 flags; 586 587 /* if dest is NULL, then return directly */ 588 if (!dest) 589 return; 590 591 /* Increase the refcnt counter of the dest */ 592 ip_vs_dest_hold(dest); 593 594 conn_flags = atomic_read(&dest->conn_flags); 595 if (cp->protocol != IPPROTO_UDP) 596 conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; 597 flags = cp->flags; 598 /* Bind with the destination and its corresponding transmitter */ 599 if (flags & IP_VS_CONN_F_SYNC) { 600 /* if the connection is not template and is created 601 * by sync, preserve the activity flag. 602 */ 603 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 604 conn_flags &= ~IP_VS_CONN_F_INACTIVE; 605 /* connections inherit forwarding method from dest */ 606 flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); 607 } 608 flags |= conn_flags; 609 cp->flags = flags; 610 cp->dest = dest; 611 612 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " 613 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 614 "dest->refcnt:%d\n", 615 ip_vs_proto_name(cp->protocol), 616 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 617 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 618 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 619 ip_vs_fwd_tag(cp), cp->state, 620 cp->flags, atomic_read(&cp->refcnt), 621 atomic_read(&dest->refcnt)); 622 623 /* Update the connection counters */ 624 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 625 /* It is a normal connection, so modify the counters 626 * according to the flags, later the protocol can 627 * update them on state change 628 */ 629 if (!(flags & IP_VS_CONN_F_INACTIVE)) 630 atomic_inc(&dest->activeconns); 631 else 632 atomic_inc(&dest->inactconns); 633 } else { 634 /* It is a persistent connection/template, so increase 635 the persistent connection counter */ 636 atomic_inc(&dest->persistconns); 637 } 638 639 if (dest->u_threshold != 0 && 640 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 641 dest->flags |= IP_VS_DEST_F_OVERLOAD; 642 } 643 644 645 /* 646 * Check if there is a destination for the connection, if so 647 * bind the connection to the destination. 648 */ 649 void ip_vs_try_bind_dest(struct ip_vs_conn *cp) 650 { 651 struct ip_vs_dest *dest; 652 653 rcu_read_lock(); 654 655 /* This function is only invoked by the synchronization code. We do 656 * not currently support heterogeneous pools with synchronization, 657 * so we can make the assumption that the svc_af is the same as the 658 * dest_af 659 */ 660 dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr, 661 cp->dport, &cp->vaddr, cp->vport, 662 cp->protocol, cp->fwmark, cp->flags); 663 if (dest) { 664 struct ip_vs_proto_data *pd; 665 666 spin_lock_bh(&cp->lock); 667 if (cp->dest) { 668 spin_unlock_bh(&cp->lock); 669 rcu_read_unlock(); 670 return; 671 } 672 673 /* Applications work depending on the forwarding method 674 * but better to reassign them always when binding dest */ 675 if (cp->app) 676 ip_vs_unbind_app(cp); 677 678 ip_vs_bind_dest(cp, dest); 679 spin_unlock_bh(&cp->lock); 680 681 /* Update its packet transmitter */ 682 cp->packet_xmit = NULL; 683 #ifdef CONFIG_IP_VS_IPV6 684 if (cp->af == AF_INET6) 685 ip_vs_bind_xmit_v6(cp); 686 else 687 #endif 688 ip_vs_bind_xmit(cp); 689 690 pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol); 691 if (pd && atomic_read(&pd->appcnt)) 692 ip_vs_bind_app(cp, pd->pp); 693 } 694 rcu_read_unlock(); 695 } 696 697 698 /* 699 * Unbind a connection entry with its VS destination 700 * Called by the ip_vs_conn_expire function. 701 */ 702 static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 703 { 704 struct ip_vs_dest *dest = cp->dest; 705 706 if (!dest) 707 return; 708 709 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " 710 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 711 "dest->refcnt:%d\n", 712 ip_vs_proto_name(cp->protocol), 713 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 714 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 715 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 716 ip_vs_fwd_tag(cp), cp->state, 717 cp->flags, atomic_read(&cp->refcnt), 718 atomic_read(&dest->refcnt)); 719 720 /* Update the connection counters */ 721 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 722 /* It is a normal connection, so decrease the inactconns 723 or activeconns counter */ 724 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 725 atomic_dec(&dest->inactconns); 726 } else { 727 atomic_dec(&dest->activeconns); 728 } 729 } else { 730 /* It is a persistent connection/template, so decrease 731 the persistent connection counter */ 732 atomic_dec(&dest->persistconns); 733 } 734 735 if (dest->l_threshold != 0) { 736 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 737 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 738 } else if (dest->u_threshold != 0) { 739 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 740 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 741 } else { 742 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 743 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 744 } 745 746 ip_vs_dest_put(dest); 747 } 748 749 static int expire_quiescent_template(struct netns_ipvs *ipvs, 750 struct ip_vs_dest *dest) 751 { 752 #ifdef CONFIG_SYSCTL 753 return ipvs->sysctl_expire_quiescent_template && 754 (atomic_read(&dest->weight) == 0); 755 #else 756 return 0; 757 #endif 758 } 759 760 /* 761 * Checking if the destination of a connection template is available. 762 * If available, return 1, otherwise invalidate this connection 763 * template and return 0. 764 */ 765 int ip_vs_check_template(struct ip_vs_conn *ct) 766 { 767 struct ip_vs_dest *dest = ct->dest; 768 struct netns_ipvs *ipvs = ct->ipvs; 769 770 /* 771 * Checking the dest server status. 772 */ 773 if ((dest == NULL) || 774 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 775 expire_quiescent_template(ipvs, dest)) { 776 IP_VS_DBG_BUF(9, "check_template: dest not available for " 777 "protocol %s s:%s:%d v:%s:%d " 778 "-> d:%s:%d\n", 779 ip_vs_proto_name(ct->protocol), 780 IP_VS_DBG_ADDR(ct->af, &ct->caddr), 781 ntohs(ct->cport), 782 IP_VS_DBG_ADDR(ct->af, &ct->vaddr), 783 ntohs(ct->vport), 784 IP_VS_DBG_ADDR(ct->daf, &ct->daddr), 785 ntohs(ct->dport)); 786 787 /* 788 * Invalidate the connection template 789 */ 790 if (ct->vport != htons(0xffff)) { 791 if (ip_vs_conn_unhash(ct)) { 792 ct->dport = htons(0xffff); 793 ct->vport = htons(0xffff); 794 ct->cport = 0; 795 ip_vs_conn_hash(ct); 796 } 797 } 798 799 /* 800 * Simply decrease the refcnt of the template, 801 * don't restart its timer. 802 */ 803 __ip_vs_conn_put(ct); 804 return 0; 805 } 806 return 1; 807 } 808 809 static void ip_vs_conn_rcu_free(struct rcu_head *head) 810 { 811 struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, 812 rcu_head); 813 814 ip_vs_pe_put(cp->pe); 815 kfree(cp->pe_data); 816 kmem_cache_free(ip_vs_conn_cachep, cp); 817 } 818 819 static void ip_vs_conn_expire(unsigned long data) 820 { 821 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 822 struct netns_ipvs *ipvs = cp->ipvs; 823 824 /* 825 * do I control anybody? 826 */ 827 if (atomic_read(&cp->n_control)) 828 goto expire_later; 829 830 /* Unlink conn if not referenced anymore */ 831 if (likely(ip_vs_conn_unlink(cp))) { 832 /* delete the timer if it is activated by other users */ 833 del_timer(&cp->timer); 834 835 /* does anybody control me? */ 836 if (cp->control) 837 ip_vs_control_del(cp); 838 839 if ((cp->flags & IP_VS_CONN_F_NFCT) && 840 !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { 841 /* Do not access conntracks during subsys cleanup 842 * because nf_conntrack_find_get can not be used after 843 * conntrack cleanup for the net. 844 */ 845 smp_rmb(); 846 if (ipvs->enable) 847 ip_vs_conn_drop_conntrack(cp); 848 } 849 850 if (unlikely(cp->app != NULL)) 851 ip_vs_unbind_app(cp); 852 ip_vs_unbind_dest(cp); 853 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 854 atomic_dec(&ip_vs_conn_no_cport_cnt); 855 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 856 ip_vs_conn_rcu_free(&cp->rcu_head); 857 else 858 call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); 859 atomic_dec(&ipvs->conn_count); 860 return; 861 } 862 863 expire_later: 864 IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", 865 atomic_read(&cp->refcnt), 866 atomic_read(&cp->n_control)); 867 868 atomic_inc(&cp->refcnt); 869 cp->timeout = 60*HZ; 870 871 if (ipvs->sync_state & IP_VS_STATE_MASTER) 872 ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); 873 874 __ip_vs_conn_put_timer(cp); 875 } 876 877 /* Modify timer, so that it expires as soon as possible. 878 * Can be called without reference only if under RCU lock. 879 */ 880 void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 881 { 882 /* Using mod_timer_pending will ensure the timer is not 883 * modified after the final del_timer in ip_vs_conn_expire. 884 */ 885 if (timer_pending(&cp->timer) && 886 time_after(cp->timer.expires, jiffies)) 887 mod_timer_pending(&cp->timer, jiffies); 888 } 889 890 891 /* 892 * Create a new connection entry and hash it into the ip_vs_conn_tab 893 */ 894 struct ip_vs_conn * 895 ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, 896 const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, 897 struct ip_vs_dest *dest, __u32 fwmark) 898 { 899 struct ip_vs_conn *cp; 900 struct netns_ipvs *ipvs = p->ipvs; 901 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs, 902 p->protocol); 903 904 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); 905 if (cp == NULL) { 906 IP_VS_ERR_RL("%s(): no memory\n", __func__); 907 return NULL; 908 } 909 910 INIT_HLIST_NODE(&cp->c_list); 911 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 912 cp->ipvs = ipvs; 913 cp->af = p->af; 914 cp->daf = dest_af; 915 cp->protocol = p->protocol; 916 ip_vs_addr_set(p->af, &cp->caddr, p->caddr); 917 cp->cport = p->cport; 918 /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ 919 ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, 920 &cp->vaddr, p->vaddr); 921 cp->vport = p->vport; 922 ip_vs_addr_set(cp->daf, &cp->daddr, daddr); 923 cp->dport = dport; 924 cp->flags = flags; 925 cp->fwmark = fwmark; 926 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { 927 ip_vs_pe_get(p->pe); 928 cp->pe = p->pe; 929 cp->pe_data = p->pe_data; 930 cp->pe_data_len = p->pe_data_len; 931 } else { 932 cp->pe = NULL; 933 cp->pe_data = NULL; 934 cp->pe_data_len = 0; 935 } 936 spin_lock_init(&cp->lock); 937 938 /* 939 * Set the entry is referenced by the current thread before hashing 940 * it in the table, so that other thread run ip_vs_random_dropentry 941 * but cannot drop this entry. 942 */ 943 atomic_set(&cp->refcnt, 1); 944 945 cp->control = NULL; 946 atomic_set(&cp->n_control, 0); 947 atomic_set(&cp->in_pkts, 0); 948 949 cp->packet_xmit = NULL; 950 cp->app = NULL; 951 cp->app_data = NULL; 952 /* reset struct ip_vs_seq */ 953 cp->in_seq.delta = 0; 954 cp->out_seq.delta = 0; 955 956 atomic_inc(&ipvs->conn_count); 957 if (flags & IP_VS_CONN_F_NO_CPORT) 958 atomic_inc(&ip_vs_conn_no_cport_cnt); 959 960 /* Bind the connection with a destination server */ 961 cp->dest = NULL; 962 ip_vs_bind_dest(cp, dest); 963 964 /* Set its state and timeout */ 965 cp->state = 0; 966 cp->old_state = 0; 967 cp->timeout = 3*HZ; 968 cp->sync_endtime = jiffies & ~3UL; 969 970 /* Bind its packet transmitter */ 971 #ifdef CONFIG_IP_VS_IPV6 972 if (p->af == AF_INET6) 973 ip_vs_bind_xmit_v6(cp); 974 else 975 #endif 976 ip_vs_bind_xmit(cp); 977 978 if (unlikely(pd && atomic_read(&pd->appcnt))) 979 ip_vs_bind_app(cp, pd->pp); 980 981 /* 982 * Allow conntrack to be preserved. By default, conntrack 983 * is created and destroyed for every packet. 984 * Sometimes keeping conntrack can be useful for 985 * IP_VS_CONN_F_ONE_PACKET too. 986 */ 987 988 if (ip_vs_conntrack_enabled(ipvs)) 989 cp->flags |= IP_VS_CONN_F_NFCT; 990 991 /* Hash it in the ip_vs_conn_tab finally */ 992 ip_vs_conn_hash(cp); 993 994 return cp; 995 } 996 997 /* 998 * /proc/net/ip_vs_conn entries 999 */ 1000 #ifdef CONFIG_PROC_FS 1001 struct ip_vs_iter_state { 1002 struct seq_net_private p; 1003 struct hlist_head *l; 1004 }; 1005 1006 static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 1007 { 1008 int idx; 1009 struct ip_vs_conn *cp; 1010 struct ip_vs_iter_state *iter = seq->private; 1011 1012 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1013 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1014 /* __ip_vs_conn_get() is not needed by 1015 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show 1016 */ 1017 if (pos-- == 0) { 1018 iter->l = &ip_vs_conn_tab[idx]; 1019 return cp; 1020 } 1021 } 1022 cond_resched_rcu(); 1023 } 1024 1025 return NULL; 1026 } 1027 1028 static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 1029 __acquires(RCU) 1030 { 1031 struct ip_vs_iter_state *iter = seq->private; 1032 1033 iter->l = NULL; 1034 rcu_read_lock(); 1035 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 1036 } 1037 1038 static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1039 { 1040 struct ip_vs_conn *cp = v; 1041 struct ip_vs_iter_state *iter = seq->private; 1042 struct hlist_node *e; 1043 struct hlist_head *l = iter->l; 1044 int idx; 1045 1046 ++*pos; 1047 if (v == SEQ_START_TOKEN) 1048 return ip_vs_conn_array(seq, 0); 1049 1050 /* more on same hash chain? */ 1051 e = rcu_dereference(hlist_next_rcu(&cp->c_list)); 1052 if (e) 1053 return hlist_entry(e, struct ip_vs_conn, c_list); 1054 1055 idx = l - ip_vs_conn_tab; 1056 while (++idx < ip_vs_conn_tab_size) { 1057 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1058 iter->l = &ip_vs_conn_tab[idx]; 1059 return cp; 1060 } 1061 cond_resched_rcu(); 1062 } 1063 iter->l = NULL; 1064 return NULL; 1065 } 1066 1067 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 1068 __releases(RCU) 1069 { 1070 rcu_read_unlock(); 1071 } 1072 1073 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 1074 { 1075 1076 if (v == SEQ_START_TOKEN) 1077 seq_puts(seq, 1078 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); 1079 else { 1080 const struct ip_vs_conn *cp = v; 1081 struct net *net = seq_file_net(seq); 1082 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; 1083 size_t len = 0; 1084 char dbuf[IP_VS_ADDRSTRLEN]; 1085 1086 if (!net_eq(cp->ipvs->net, net)) 1087 return 0; 1088 if (cp->pe_data) { 1089 pe_data[0] = ' '; 1090 len = strlen(cp->pe->name); 1091 memcpy(pe_data + 1, cp->pe->name, len); 1092 pe_data[len + 1] = ' '; 1093 len += 2; 1094 len += cp->pe->show_pe_data(cp, pe_data + len); 1095 } 1096 pe_data[len] = '\0'; 1097 1098 #ifdef CONFIG_IP_VS_IPV6 1099 if (cp->daf == AF_INET6) 1100 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); 1101 else 1102 #endif 1103 snprintf(dbuf, sizeof(dbuf), "%08X", 1104 ntohl(cp->daddr.ip)); 1105 1106 #ifdef CONFIG_IP_VS_IPV6 1107 if (cp->af == AF_INET6) 1108 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1109 "%s %04X %-11s %7lu%s\n", 1110 ip_vs_proto_name(cp->protocol), 1111 &cp->caddr.in6, ntohs(cp->cport), 1112 &cp->vaddr.in6, ntohs(cp->vport), 1113 dbuf, ntohs(cp->dport), 1114 ip_vs_state_name(cp->protocol, cp->state), 1115 (cp->timer.expires-jiffies)/HZ, pe_data); 1116 else 1117 #endif 1118 seq_printf(seq, 1119 "%-3s %08X %04X %08X %04X" 1120 " %s %04X %-11s %7lu%s\n", 1121 ip_vs_proto_name(cp->protocol), 1122 ntohl(cp->caddr.ip), ntohs(cp->cport), 1123 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1124 dbuf, ntohs(cp->dport), 1125 ip_vs_state_name(cp->protocol, cp->state), 1126 (cp->timer.expires-jiffies)/HZ, pe_data); 1127 } 1128 return 0; 1129 } 1130 1131 static const struct seq_operations ip_vs_conn_seq_ops = { 1132 .start = ip_vs_conn_seq_start, 1133 .next = ip_vs_conn_seq_next, 1134 .stop = ip_vs_conn_seq_stop, 1135 .show = ip_vs_conn_seq_show, 1136 }; 1137 1138 static int ip_vs_conn_open(struct inode *inode, struct file *file) 1139 { 1140 return seq_open_net(inode, file, &ip_vs_conn_seq_ops, 1141 sizeof(struct ip_vs_iter_state)); 1142 } 1143 1144 static const struct file_operations ip_vs_conn_fops = { 1145 .owner = THIS_MODULE, 1146 .open = ip_vs_conn_open, 1147 .read = seq_read, 1148 .llseek = seq_lseek, 1149 .release = seq_release_net, 1150 }; 1151 1152 static const char *ip_vs_origin_name(unsigned int flags) 1153 { 1154 if (flags & IP_VS_CONN_F_SYNC) 1155 return "SYNC"; 1156 else 1157 return "LOCAL"; 1158 } 1159 1160 static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) 1161 { 1162 char dbuf[IP_VS_ADDRSTRLEN]; 1163 1164 if (v == SEQ_START_TOKEN) 1165 seq_puts(seq, 1166 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 1167 else { 1168 const struct ip_vs_conn *cp = v; 1169 struct net *net = seq_file_net(seq); 1170 1171 if (!net_eq(cp->ipvs->net, net)) 1172 return 0; 1173 1174 #ifdef CONFIG_IP_VS_IPV6 1175 if (cp->daf == AF_INET6) 1176 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); 1177 else 1178 #endif 1179 snprintf(dbuf, sizeof(dbuf), "%08X", 1180 ntohl(cp->daddr.ip)); 1181 1182 #ifdef CONFIG_IP_VS_IPV6 1183 if (cp->af == AF_INET6) 1184 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1185 "%s %04X %-11s %-6s %7lu\n", 1186 ip_vs_proto_name(cp->protocol), 1187 &cp->caddr.in6, ntohs(cp->cport), 1188 &cp->vaddr.in6, ntohs(cp->vport), 1189 dbuf, ntohs(cp->dport), 1190 ip_vs_state_name(cp->protocol, cp->state), 1191 ip_vs_origin_name(cp->flags), 1192 (cp->timer.expires-jiffies)/HZ); 1193 else 1194 #endif 1195 seq_printf(seq, 1196 "%-3s %08X %04X %08X %04X " 1197 "%s %04X %-11s %-6s %7lu\n", 1198 ip_vs_proto_name(cp->protocol), 1199 ntohl(cp->caddr.ip), ntohs(cp->cport), 1200 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1201 dbuf, ntohs(cp->dport), 1202 ip_vs_state_name(cp->protocol, cp->state), 1203 ip_vs_origin_name(cp->flags), 1204 (cp->timer.expires-jiffies)/HZ); 1205 } 1206 return 0; 1207 } 1208 1209 static const struct seq_operations ip_vs_conn_sync_seq_ops = { 1210 .start = ip_vs_conn_seq_start, 1211 .next = ip_vs_conn_seq_next, 1212 .stop = ip_vs_conn_seq_stop, 1213 .show = ip_vs_conn_sync_seq_show, 1214 }; 1215 1216 static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 1217 { 1218 return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, 1219 sizeof(struct ip_vs_iter_state)); 1220 } 1221 1222 static const struct file_operations ip_vs_conn_sync_fops = { 1223 .owner = THIS_MODULE, 1224 .open = ip_vs_conn_sync_open, 1225 .read = seq_read, 1226 .llseek = seq_lseek, 1227 .release = seq_release_net, 1228 }; 1229 1230 #endif 1231 1232 1233 /* 1234 * Randomly drop connection entries before running out of memory 1235 */ 1236 static inline int todrop_entry(struct ip_vs_conn *cp) 1237 { 1238 /* 1239 * The drop rate array needs tuning for real environments. 1240 * Called from timer bh only => no locking 1241 */ 1242 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 1243 static char todrop_counter[9] = {0}; 1244 int i; 1245 1246 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 1247 This will leave enough time for normal connection to get 1248 through. */ 1249 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 1250 return 0; 1251 1252 /* Don't drop the entry if its number of incoming packets is not 1253 located in [0, 8] */ 1254 i = atomic_read(&cp->in_pkts); 1255 if (i > 8 || i < 0) return 0; 1256 1257 if (!todrop_rate[i]) return 0; 1258 if (--todrop_counter[i] > 0) return 0; 1259 1260 todrop_counter[i] = todrop_rate[i]; 1261 return 1; 1262 } 1263 1264 static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) 1265 { 1266 struct ip_vs_service *svc; 1267 1268 if (!cp->dest) 1269 return false; 1270 svc = rcu_dereference(cp->dest->svc); 1271 return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); 1272 } 1273 1274 /* Called from keventd and must protect itself from softirqs */ 1275 void ip_vs_random_dropentry(struct netns_ipvs *ipvs) 1276 { 1277 int idx; 1278 struct ip_vs_conn *cp, *cp_c; 1279 1280 rcu_read_lock(); 1281 /* 1282 * Randomly scan 1/32 of the whole table every second 1283 */ 1284 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { 1285 unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; 1286 1287 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 1288 if (cp->ipvs != ipvs) 1289 continue; 1290 if (cp->flags & IP_VS_CONN_F_TEMPLATE) { 1291 if (atomic_read(&cp->n_control) || 1292 !ip_vs_conn_ops_mode(cp)) 1293 continue; 1294 else 1295 /* connection template of OPS */ 1296 goto try_drop; 1297 } 1298 if (cp->protocol == IPPROTO_TCP) { 1299 switch(cp->state) { 1300 case IP_VS_TCP_S_SYN_RECV: 1301 case IP_VS_TCP_S_SYNACK: 1302 break; 1303 1304 case IP_VS_TCP_S_ESTABLISHED: 1305 if (todrop_entry(cp)) 1306 break; 1307 continue; 1308 1309 default: 1310 continue; 1311 } 1312 } else if (cp->protocol == IPPROTO_SCTP) { 1313 switch (cp->state) { 1314 case IP_VS_SCTP_S_INIT1: 1315 case IP_VS_SCTP_S_INIT: 1316 break; 1317 case IP_VS_SCTP_S_ESTABLISHED: 1318 if (todrop_entry(cp)) 1319 break; 1320 continue; 1321 default: 1322 continue; 1323 } 1324 } else { 1325 try_drop: 1326 if (!todrop_entry(cp)) 1327 continue; 1328 } 1329 1330 IP_VS_DBG(4, "del connection\n"); 1331 ip_vs_conn_expire_now(cp); 1332 cp_c = cp->control; 1333 /* cp->control is valid only with reference to cp */ 1334 if (cp_c && __ip_vs_conn_get(cp)) { 1335 IP_VS_DBG(4, "del conn template\n"); 1336 ip_vs_conn_expire_now(cp_c); 1337 __ip_vs_conn_put(cp); 1338 } 1339 } 1340 cond_resched_rcu(); 1341 } 1342 rcu_read_unlock(); 1343 } 1344 1345 1346 /* 1347 * Flush all the connection entries in the ip_vs_conn_tab 1348 */ 1349 static void ip_vs_conn_flush(struct netns_ipvs *ipvs) 1350 { 1351 int idx; 1352 struct ip_vs_conn *cp, *cp_c; 1353 1354 flush_again: 1355 rcu_read_lock(); 1356 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1357 1358 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1359 if (cp->ipvs != ipvs) 1360 continue; 1361 IP_VS_DBG(4, "del connection\n"); 1362 ip_vs_conn_expire_now(cp); 1363 cp_c = cp->control; 1364 /* cp->control is valid only with reference to cp */ 1365 if (cp_c && __ip_vs_conn_get(cp)) { 1366 IP_VS_DBG(4, "del conn template\n"); 1367 ip_vs_conn_expire_now(cp_c); 1368 __ip_vs_conn_put(cp); 1369 } 1370 } 1371 cond_resched_rcu(); 1372 } 1373 rcu_read_unlock(); 1374 1375 /* the counter may be not NULL, because maybe some conn entries 1376 are run by slow timer handler or unhashed but still referred */ 1377 if (atomic_read(&ipvs->conn_count) != 0) { 1378 schedule(); 1379 goto flush_again; 1380 } 1381 } 1382 /* 1383 * per netns init and exit 1384 */ 1385 int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) 1386 { 1387 atomic_set(&ipvs->conn_count, 0); 1388 1389 proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops); 1390 proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net, 1391 &ip_vs_conn_sync_fops); 1392 return 0; 1393 } 1394 1395 void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) 1396 { 1397 /* flush all the connection entries first */ 1398 ip_vs_conn_flush(ipvs); 1399 remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); 1400 remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); 1401 } 1402 1403 int __init ip_vs_conn_init(void) 1404 { 1405 int idx; 1406 1407 /* Compute size and mask */ 1408 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1409 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 1410 1411 /* 1412 * Allocate the connection hash table and initialize its list heads 1413 */ 1414 ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); 1415 if (!ip_vs_conn_tab) 1416 return -ENOMEM; 1417 1418 /* Allocate ip_vs_conn slab cache */ 1419 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 1420 sizeof(struct ip_vs_conn), 0, 1421 SLAB_HWCACHE_ALIGN, NULL); 1422 if (!ip_vs_conn_cachep) { 1423 vfree(ip_vs_conn_tab); 1424 return -ENOMEM; 1425 } 1426 1427 pr_info("Connection hash table configured " 1428 "(size=%d, memory=%ldKbytes)\n", 1429 ip_vs_conn_tab_size, 1430 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1431 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1432 sizeof(struct ip_vs_conn)); 1433 1434 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 1435 INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); 1436 1437 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 1438 spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); 1439 } 1440 1441 /* calculate the random value for connection hash */ 1442 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1443 1444 return 0; 1445 } 1446 1447 void ip_vs_conn_cleanup(void) 1448 { 1449 /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ 1450 rcu_barrier(); 1451 /* Release the empty cache */ 1452 kmem_cache_destroy(ip_vs_conn_cachep); 1453 vfree(ip_vs_conn_tab); 1454 } 1455