1 /* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 20 * 21 * Changes: 22 * 23 */ 24 25 #define KMSG_COMPONENT "IPVS" 26 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 27 28 #include <linux/interrupt.h> 29 #include <linux/in.h> 30 #include <linux/inet.h> 31 #include <linux/net.h> 32 #include <linux/kernel.h> 33 #include <linux/module.h> 34 #include <linux/vmalloc.h> 35 #include <linux/proc_fs.h> /* for proc_net_* */ 36 #include <linux/slab.h> 37 #include <linux/seq_file.h> 38 #include <linux/jhash.h> 39 #include <linux/random.h> 40 41 #include <net/net_namespace.h> 42 #include <net/ip_vs.h> 43 44 45 #ifndef CONFIG_IP_VS_TAB_BITS 46 #define CONFIG_IP_VS_TAB_BITS 12 47 #endif 48 49 /* 50 * Connection hash size. Default is what was selected at compile time. 51 */ 52 static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; 53 module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); 54 MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); 55 56 /* size and mask values */ 57 int ip_vs_conn_tab_size __read_mostly; 58 static int ip_vs_conn_tab_mask __read_mostly; 59 60 /* 61 * Connection hash table: for input and output packets lookups of IPVS 62 */ 63 static struct hlist_head *ip_vs_conn_tab __read_mostly; 64 65 /* SLAB cache for IPVS connections */ 66 static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 67 68 /* counter for no client port connections */ 69 static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 70 71 /* random value for IPVS connection hash */ 72 static unsigned int ip_vs_conn_rnd __read_mostly; 73 74 /* 75 * Fine locking granularity for big connection hash table 76 */ 77 #define CT_LOCKARRAY_BITS 5 78 #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 79 #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 80 81 /* We need an addrstrlen that works with or without v6 */ 82 #ifdef CONFIG_IP_VS_IPV6 83 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN 84 #else 85 #define IP_VS_ADDRSTRLEN (8+1) 86 #endif 87 88 struct ip_vs_aligned_lock 89 { 90 spinlock_t l; 91 } __attribute__((__aligned__(SMP_CACHE_BYTES))); 92 93 /* lock array for conn table */ 94 static struct ip_vs_aligned_lock 95 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 96 97 static inline void ct_write_lock_bh(unsigned int key) 98 { 99 spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 100 } 101 102 static inline void ct_write_unlock_bh(unsigned int key) 103 { 104 spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 105 } 106 107 static void ip_vs_conn_expire(unsigned long data); 108 109 /* 110 * Returns hash value for IPVS connection entry 111 */ 112 static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, 113 const union nf_inet_addr *addr, 114 __be16 port) 115 { 116 #ifdef CONFIG_IP_VS_IPV6 117 if (af == AF_INET6) 118 return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 119 (__force u32)port, proto, ip_vs_conn_rnd) ^ 120 ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; 121 #endif 122 return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 123 ip_vs_conn_rnd) ^ 124 ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; 125 } 126 127 static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, 128 bool inverse) 129 { 130 const union nf_inet_addr *addr; 131 __be16 port; 132 133 if (p->pe_data && p->pe->hashkey_raw) 134 return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & 135 ip_vs_conn_tab_mask; 136 137 if (likely(!inverse)) { 138 addr = p->caddr; 139 port = p->cport; 140 } else { 141 addr = p->vaddr; 142 port = p->vport; 143 } 144 145 return ip_vs_conn_hashkey(p->ipvs, p->af, p->protocol, addr, port); 146 } 147 148 static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) 149 { 150 struct ip_vs_conn_param p; 151 152 ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol, 153 &cp->caddr, cp->cport, NULL, 0, &p); 154 155 if (cp->pe) { 156 p.pe = cp->pe; 157 p.pe_data = cp->pe_data; 158 p.pe_data_len = cp->pe_data_len; 159 } 160 161 return ip_vs_conn_hashkey_param(&p, false); 162 } 163 164 /* 165 * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. 166 * returns bool success. 167 */ 168 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 169 { 170 unsigned int hash; 171 int ret; 172 173 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 174 return 0; 175 176 /* Hash by protocol, client address and port */ 177 hash = ip_vs_conn_hashkey_conn(cp); 178 179 ct_write_lock_bh(hash); 180 spin_lock(&cp->lock); 181 182 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 183 cp->flags |= IP_VS_CONN_F_HASHED; 184 atomic_inc(&cp->refcnt); 185 hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); 186 ret = 1; 187 } else { 188 pr_err("%s(): request for already hashed, called from %pF\n", 189 __func__, __builtin_return_address(0)); 190 ret = 0; 191 } 192 193 spin_unlock(&cp->lock); 194 ct_write_unlock_bh(hash); 195 196 return ret; 197 } 198 199 200 /* 201 * UNhashes ip_vs_conn from ip_vs_conn_tab. 202 * returns bool success. Caller should hold conn reference. 203 */ 204 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 205 { 206 unsigned int hash; 207 int ret; 208 209 /* unhash it and decrease its reference counter */ 210 hash = ip_vs_conn_hashkey_conn(cp); 211 212 ct_write_lock_bh(hash); 213 spin_lock(&cp->lock); 214 215 if (cp->flags & IP_VS_CONN_F_HASHED) { 216 hlist_del_rcu(&cp->c_list); 217 cp->flags &= ~IP_VS_CONN_F_HASHED; 218 atomic_dec(&cp->refcnt); 219 ret = 1; 220 } else 221 ret = 0; 222 223 spin_unlock(&cp->lock); 224 ct_write_unlock_bh(hash); 225 226 return ret; 227 } 228 229 /* Try to unlink ip_vs_conn from ip_vs_conn_tab. 230 * returns bool success. 231 */ 232 static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) 233 { 234 unsigned int hash; 235 bool ret; 236 237 hash = ip_vs_conn_hashkey_conn(cp); 238 239 ct_write_lock_bh(hash); 240 spin_lock(&cp->lock); 241 242 if (cp->flags & IP_VS_CONN_F_HASHED) { 243 ret = false; 244 /* Decrease refcnt and unlink conn only if we are last user */ 245 if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { 246 hlist_del_rcu(&cp->c_list); 247 cp->flags &= ~IP_VS_CONN_F_HASHED; 248 ret = true; 249 } 250 } else 251 ret = atomic_read(&cp->refcnt) ? false : true; 252 253 spin_unlock(&cp->lock); 254 ct_write_unlock_bh(hash); 255 256 return ret; 257 } 258 259 260 /* 261 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 262 * Called for pkts coming from OUTside-to-INside. 263 * p->caddr, p->cport: pkt source address (foreign host) 264 * p->vaddr, p->vport: pkt dest address (load balancer) 265 */ 266 static inline struct ip_vs_conn * 267 __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 268 { 269 unsigned int hash; 270 struct ip_vs_conn *cp; 271 272 hash = ip_vs_conn_hashkey_param(p, false); 273 274 rcu_read_lock(); 275 276 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 277 if (p->cport == cp->cport && p->vport == cp->vport && 278 cp->af == p->af && 279 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 280 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && 281 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 282 p->protocol == cp->protocol && 283 cp->ipvs == p->ipvs) { 284 if (!__ip_vs_conn_get(cp)) 285 continue; 286 /* HIT */ 287 rcu_read_unlock(); 288 return cp; 289 } 290 } 291 292 rcu_read_unlock(); 293 294 return NULL; 295 } 296 297 struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) 298 { 299 struct ip_vs_conn *cp; 300 301 cp = __ip_vs_conn_in_get(p); 302 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { 303 struct ip_vs_conn_param cport_zero_p = *p; 304 cport_zero_p.cport = 0; 305 cp = __ip_vs_conn_in_get(&cport_zero_p); 306 } 307 308 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", 309 ip_vs_proto_name(p->protocol), 310 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 311 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 312 cp ? "hit" : "not hit"); 313 314 return cp; 315 } 316 317 static int 318 ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, 319 int af, const struct sk_buff *skb, 320 const struct ip_vs_iphdr *iph, 321 struct ip_vs_conn_param *p) 322 { 323 __be16 _ports[2], *pptr; 324 325 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph); 326 if (pptr == NULL) 327 return 1; 328 329 if (likely(!ip_vs_iph_inverse(iph))) 330 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr, 331 pptr[0], &iph->daddr, pptr[1], p); 332 else 333 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr, 334 pptr[1], &iph->saddr, pptr[0], p); 335 return 0; 336 } 337 338 struct ip_vs_conn * 339 ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, 340 const struct sk_buff *skb, 341 const struct ip_vs_iphdr *iph) 342 { 343 struct ip_vs_conn_param p; 344 345 if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) 346 return NULL; 347 348 return ip_vs_conn_in_get(&p); 349 } 350 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); 351 352 /* Get reference to connection template */ 353 struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) 354 { 355 unsigned int hash; 356 struct ip_vs_conn *cp; 357 358 hash = ip_vs_conn_hashkey_param(p, false); 359 360 rcu_read_lock(); 361 362 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 363 if (unlikely(p->pe_data && p->pe->ct_match)) { 364 if (cp->ipvs != p->ipvs) 365 continue; 366 if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { 367 if (__ip_vs_conn_get(cp)) 368 goto out; 369 } 370 continue; 371 } 372 373 if (cp->af == p->af && 374 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 375 /* protocol should only be IPPROTO_IP if 376 * p->vaddr is a fwmark */ 377 ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : 378 p->af, p->vaddr, &cp->vaddr) && 379 p->vport == cp->vport && p->cport == cp->cport && 380 cp->flags & IP_VS_CONN_F_TEMPLATE && 381 p->protocol == cp->protocol && 382 cp->ipvs == p->ipvs) { 383 if (__ip_vs_conn_get(cp)) 384 goto out; 385 } 386 } 387 cp = NULL; 388 389 out: 390 rcu_read_unlock(); 391 392 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", 393 ip_vs_proto_name(p->protocol), 394 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 395 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 396 cp ? "hit" : "not hit"); 397 398 return cp; 399 } 400 401 /* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 402 * Called for pkts coming from inside-to-OUTside. 403 * p->caddr, p->cport: pkt source address (inside host) 404 * p->vaddr, p->vport: pkt dest address (foreign host) */ 405 struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) 406 { 407 unsigned int hash; 408 struct ip_vs_conn *cp, *ret=NULL; 409 410 /* 411 * Check for "full" addressed entries 412 */ 413 hash = ip_vs_conn_hashkey_param(p, true); 414 415 rcu_read_lock(); 416 417 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 418 if (p->vport == cp->cport && p->cport == cp->dport && 419 cp->af == p->af && 420 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && 421 ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && 422 p->protocol == cp->protocol && 423 cp->ipvs == p->ipvs) { 424 if (!__ip_vs_conn_get(cp)) 425 continue; 426 /* HIT */ 427 ret = cp; 428 break; 429 } 430 } 431 432 rcu_read_unlock(); 433 434 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", 435 ip_vs_proto_name(p->protocol), 436 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), 437 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), 438 ret ? "hit" : "not hit"); 439 440 return ret; 441 } 442 443 struct ip_vs_conn * 444 ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, 445 const struct sk_buff *skb, 446 const struct ip_vs_iphdr *iph) 447 { 448 struct ip_vs_conn_param p; 449 450 if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p)) 451 return NULL; 452 453 return ip_vs_conn_out_get(&p); 454 } 455 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); 456 457 static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) 458 { 459 __ip_vs_conn_put(cp); 460 ip_vs_conn_expire((unsigned long)cp); 461 } 462 463 /* 464 * Put back the conn and restart its timer with its timeout 465 */ 466 static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) 467 { 468 unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 469 0 : cp->timeout; 470 mod_timer(&cp->timer, jiffies+t); 471 472 __ip_vs_conn_put(cp); 473 } 474 475 void ip_vs_conn_put(struct ip_vs_conn *cp) 476 { 477 if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && 478 (atomic_read(&cp->refcnt) == 1) && 479 !timer_pending(&cp->timer)) 480 /* expire connection immediately */ 481 __ip_vs_conn_put_notimer(cp); 482 else 483 __ip_vs_conn_put_timer(cp); 484 } 485 486 /* 487 * Fill a no_client_port connection with a client port number 488 */ 489 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 490 { 491 if (ip_vs_conn_unhash(cp)) { 492 spin_lock_bh(&cp->lock); 493 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 494 atomic_dec(&ip_vs_conn_no_cport_cnt); 495 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 496 cp->cport = cport; 497 } 498 spin_unlock_bh(&cp->lock); 499 500 /* hash on new dport */ 501 ip_vs_conn_hash(cp); 502 } 503 } 504 505 506 /* 507 * Bind a connection entry with the corresponding packet_xmit. 508 * Called by ip_vs_conn_new. 509 */ 510 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 511 { 512 switch (IP_VS_FWD_METHOD(cp)) { 513 case IP_VS_CONN_F_MASQ: 514 cp->packet_xmit = ip_vs_nat_xmit; 515 break; 516 517 case IP_VS_CONN_F_TUNNEL: 518 #ifdef CONFIG_IP_VS_IPV6 519 if (cp->daf == AF_INET6) 520 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 521 else 522 #endif 523 cp->packet_xmit = ip_vs_tunnel_xmit; 524 break; 525 526 case IP_VS_CONN_F_DROUTE: 527 cp->packet_xmit = ip_vs_dr_xmit; 528 break; 529 530 case IP_VS_CONN_F_LOCALNODE: 531 cp->packet_xmit = ip_vs_null_xmit; 532 break; 533 534 case IP_VS_CONN_F_BYPASS: 535 cp->packet_xmit = ip_vs_bypass_xmit; 536 break; 537 } 538 } 539 540 #ifdef CONFIG_IP_VS_IPV6 541 static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) 542 { 543 switch (IP_VS_FWD_METHOD(cp)) { 544 case IP_VS_CONN_F_MASQ: 545 cp->packet_xmit = ip_vs_nat_xmit_v6; 546 break; 547 548 case IP_VS_CONN_F_TUNNEL: 549 if (cp->daf == AF_INET6) 550 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 551 else 552 cp->packet_xmit = ip_vs_tunnel_xmit; 553 break; 554 555 case IP_VS_CONN_F_DROUTE: 556 cp->packet_xmit = ip_vs_dr_xmit_v6; 557 break; 558 559 case IP_VS_CONN_F_LOCALNODE: 560 cp->packet_xmit = ip_vs_null_xmit; 561 break; 562 563 case IP_VS_CONN_F_BYPASS: 564 cp->packet_xmit = ip_vs_bypass_xmit_v6; 565 break; 566 } 567 } 568 #endif 569 570 571 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 572 { 573 return atomic_read(&dest->activeconns) 574 + atomic_read(&dest->inactconns); 575 } 576 577 /* 578 * Bind a connection entry with a virtual service destination 579 * Called just after a new connection entry is created. 580 */ 581 static inline void 582 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 583 { 584 unsigned int conn_flags; 585 __u32 flags; 586 587 /* if dest is NULL, then return directly */ 588 if (!dest) 589 return; 590 591 /* Increase the refcnt counter of the dest */ 592 ip_vs_dest_hold(dest); 593 594 conn_flags = atomic_read(&dest->conn_flags); 595 if (cp->protocol != IPPROTO_UDP) 596 conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; 597 flags = cp->flags; 598 /* Bind with the destination and its corresponding transmitter */ 599 if (flags & IP_VS_CONN_F_SYNC) { 600 /* if the connection is not template and is created 601 * by sync, preserve the activity flag. 602 */ 603 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 604 conn_flags &= ~IP_VS_CONN_F_INACTIVE; 605 /* connections inherit forwarding method from dest */ 606 flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); 607 } 608 flags |= conn_flags; 609 cp->flags = flags; 610 cp->dest = dest; 611 612 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " 613 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 614 "dest->refcnt:%d\n", 615 ip_vs_proto_name(cp->protocol), 616 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 617 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 618 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 619 ip_vs_fwd_tag(cp), cp->state, 620 cp->flags, atomic_read(&cp->refcnt), 621 atomic_read(&dest->refcnt)); 622 623 /* Update the connection counters */ 624 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 625 /* It is a normal connection, so modify the counters 626 * according to the flags, later the protocol can 627 * update them on state change 628 */ 629 if (!(flags & IP_VS_CONN_F_INACTIVE)) 630 atomic_inc(&dest->activeconns); 631 else 632 atomic_inc(&dest->inactconns); 633 } else { 634 /* It is a persistent connection/template, so increase 635 the persistent connection counter */ 636 atomic_inc(&dest->persistconns); 637 } 638 639 if (dest->u_threshold != 0 && 640 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 641 dest->flags |= IP_VS_DEST_F_OVERLOAD; 642 } 643 644 645 /* 646 * Check if there is a destination for the connection, if so 647 * bind the connection to the destination. 648 */ 649 void ip_vs_try_bind_dest(struct ip_vs_conn *cp) 650 { 651 struct ip_vs_dest *dest; 652 653 rcu_read_lock(); 654 655 /* This function is only invoked by the synchronization code. We do 656 * not currently support heterogeneous pools with synchronization, 657 * so we can make the assumption that the svc_af is the same as the 658 * dest_af 659 */ 660 dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr, 661 cp->dport, &cp->vaddr, cp->vport, 662 cp->protocol, cp->fwmark, cp->flags); 663 if (dest) { 664 struct ip_vs_proto_data *pd; 665 666 spin_lock_bh(&cp->lock); 667 if (cp->dest) { 668 spin_unlock_bh(&cp->lock); 669 rcu_read_unlock(); 670 return; 671 } 672 673 /* Applications work depending on the forwarding method 674 * but better to reassign them always when binding dest */ 675 if (cp->app) 676 ip_vs_unbind_app(cp); 677 678 ip_vs_bind_dest(cp, dest); 679 spin_unlock_bh(&cp->lock); 680 681 /* Update its packet transmitter */ 682 cp->packet_xmit = NULL; 683 #ifdef CONFIG_IP_VS_IPV6 684 if (cp->af == AF_INET6) 685 ip_vs_bind_xmit_v6(cp); 686 else 687 #endif 688 ip_vs_bind_xmit(cp); 689 690 pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol); 691 if (pd && atomic_read(&pd->appcnt)) 692 ip_vs_bind_app(cp, pd->pp); 693 } 694 rcu_read_unlock(); 695 } 696 697 698 /* 699 * Unbind a connection entry with its VS destination 700 * Called by the ip_vs_conn_expire function. 701 */ 702 static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 703 { 704 struct ip_vs_dest *dest = cp->dest; 705 706 if (!dest) 707 return; 708 709 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " 710 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 711 "dest->refcnt:%d\n", 712 ip_vs_proto_name(cp->protocol), 713 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 714 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 715 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 716 ip_vs_fwd_tag(cp), cp->state, 717 cp->flags, atomic_read(&cp->refcnt), 718 atomic_read(&dest->refcnt)); 719 720 /* Update the connection counters */ 721 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 722 /* It is a normal connection, so decrease the inactconns 723 or activeconns counter */ 724 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 725 atomic_dec(&dest->inactconns); 726 } else { 727 atomic_dec(&dest->activeconns); 728 } 729 } else { 730 /* It is a persistent connection/template, so decrease 731 the persistent connection counter */ 732 atomic_dec(&dest->persistconns); 733 } 734 735 if (dest->l_threshold != 0) { 736 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 737 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 738 } else if (dest->u_threshold != 0) { 739 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 740 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 741 } else { 742 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 743 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 744 } 745 746 ip_vs_dest_put(dest); 747 } 748 749 static int expire_quiescent_template(struct netns_ipvs *ipvs, 750 struct ip_vs_dest *dest) 751 { 752 #ifdef CONFIG_SYSCTL 753 return ipvs->sysctl_expire_quiescent_template && 754 (atomic_read(&dest->weight) == 0); 755 #else 756 return 0; 757 #endif 758 } 759 760 /* 761 * Checking if the destination of a connection template is available. 762 * If available, return 1, otherwise invalidate this connection 763 * template and return 0. 764 */ 765 int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) 766 { 767 struct ip_vs_dest *dest = ct->dest; 768 struct netns_ipvs *ipvs = ct->ipvs; 769 770 /* 771 * Checking the dest server status. 772 */ 773 if ((dest == NULL) || 774 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 775 expire_quiescent_template(ipvs, dest) || 776 (cdest && (dest != cdest))) { 777 IP_VS_DBG_BUF(9, "check_template: dest not available for " 778 "protocol %s s:%s:%d v:%s:%d " 779 "-> d:%s:%d\n", 780 ip_vs_proto_name(ct->protocol), 781 IP_VS_DBG_ADDR(ct->af, &ct->caddr), 782 ntohs(ct->cport), 783 IP_VS_DBG_ADDR(ct->af, &ct->vaddr), 784 ntohs(ct->vport), 785 IP_VS_DBG_ADDR(ct->daf, &ct->daddr), 786 ntohs(ct->dport)); 787 788 /* 789 * Invalidate the connection template 790 */ 791 if (ct->vport != htons(0xffff)) { 792 if (ip_vs_conn_unhash(ct)) { 793 ct->dport = htons(0xffff); 794 ct->vport = htons(0xffff); 795 ct->cport = 0; 796 ip_vs_conn_hash(ct); 797 } 798 } 799 800 /* 801 * Simply decrease the refcnt of the template, 802 * don't restart its timer. 803 */ 804 __ip_vs_conn_put(ct); 805 return 0; 806 } 807 return 1; 808 } 809 810 static void ip_vs_conn_rcu_free(struct rcu_head *head) 811 { 812 struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, 813 rcu_head); 814 815 ip_vs_pe_put(cp->pe); 816 kfree(cp->pe_data); 817 kmem_cache_free(ip_vs_conn_cachep, cp); 818 } 819 820 static void ip_vs_conn_expire(unsigned long data) 821 { 822 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 823 struct netns_ipvs *ipvs = cp->ipvs; 824 825 /* 826 * do I control anybody? 827 */ 828 if (atomic_read(&cp->n_control)) 829 goto expire_later; 830 831 /* Unlink conn if not referenced anymore */ 832 if (likely(ip_vs_conn_unlink(cp))) { 833 /* delete the timer if it is activated by other users */ 834 del_timer(&cp->timer); 835 836 /* does anybody control me? */ 837 if (cp->control) 838 ip_vs_control_del(cp); 839 840 if ((cp->flags & IP_VS_CONN_F_NFCT) && 841 !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { 842 /* Do not access conntracks during subsys cleanup 843 * because nf_conntrack_find_get can not be used after 844 * conntrack cleanup for the net. 845 */ 846 smp_rmb(); 847 if (ipvs->enable) 848 ip_vs_conn_drop_conntrack(cp); 849 } 850 851 if (unlikely(cp->app != NULL)) 852 ip_vs_unbind_app(cp); 853 ip_vs_unbind_dest(cp); 854 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 855 atomic_dec(&ip_vs_conn_no_cport_cnt); 856 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 857 ip_vs_conn_rcu_free(&cp->rcu_head); 858 else 859 call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); 860 atomic_dec(&ipvs->conn_count); 861 return; 862 } 863 864 expire_later: 865 IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", 866 atomic_read(&cp->refcnt), 867 atomic_read(&cp->n_control)); 868 869 atomic_inc(&cp->refcnt); 870 cp->timeout = 60*HZ; 871 872 if (ipvs->sync_state & IP_VS_STATE_MASTER) 873 ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); 874 875 __ip_vs_conn_put_timer(cp); 876 } 877 878 /* Modify timer, so that it expires as soon as possible. 879 * Can be called without reference only if under RCU lock. 880 */ 881 void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 882 { 883 /* Using mod_timer_pending will ensure the timer is not 884 * modified after the final del_timer in ip_vs_conn_expire. 885 */ 886 if (timer_pending(&cp->timer) && 887 time_after(cp->timer.expires, jiffies)) 888 mod_timer_pending(&cp->timer, jiffies); 889 } 890 891 892 /* 893 * Create a new connection entry and hash it into the ip_vs_conn_tab 894 */ 895 struct ip_vs_conn * 896 ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, 897 const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, 898 struct ip_vs_dest *dest, __u32 fwmark) 899 { 900 struct ip_vs_conn *cp; 901 struct netns_ipvs *ipvs = p->ipvs; 902 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs, 903 p->protocol); 904 905 cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); 906 if (cp == NULL) { 907 IP_VS_ERR_RL("%s(): no memory\n", __func__); 908 return NULL; 909 } 910 911 INIT_HLIST_NODE(&cp->c_list); 912 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 913 cp->ipvs = ipvs; 914 cp->af = p->af; 915 cp->daf = dest_af; 916 cp->protocol = p->protocol; 917 ip_vs_addr_set(p->af, &cp->caddr, p->caddr); 918 cp->cport = p->cport; 919 /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ 920 ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, 921 &cp->vaddr, p->vaddr); 922 cp->vport = p->vport; 923 ip_vs_addr_set(cp->daf, &cp->daddr, daddr); 924 cp->dport = dport; 925 cp->flags = flags; 926 cp->fwmark = fwmark; 927 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { 928 ip_vs_pe_get(p->pe); 929 cp->pe = p->pe; 930 cp->pe_data = p->pe_data; 931 cp->pe_data_len = p->pe_data_len; 932 } else { 933 cp->pe = NULL; 934 cp->pe_data = NULL; 935 cp->pe_data_len = 0; 936 } 937 spin_lock_init(&cp->lock); 938 939 /* 940 * Set the entry is referenced by the current thread before hashing 941 * it in the table, so that other thread run ip_vs_random_dropentry 942 * but cannot drop this entry. 943 */ 944 atomic_set(&cp->refcnt, 1); 945 946 cp->control = NULL; 947 atomic_set(&cp->n_control, 0); 948 atomic_set(&cp->in_pkts, 0); 949 950 cp->packet_xmit = NULL; 951 cp->app = NULL; 952 cp->app_data = NULL; 953 /* reset struct ip_vs_seq */ 954 cp->in_seq.delta = 0; 955 cp->out_seq.delta = 0; 956 957 atomic_inc(&ipvs->conn_count); 958 if (flags & IP_VS_CONN_F_NO_CPORT) 959 atomic_inc(&ip_vs_conn_no_cport_cnt); 960 961 /* Bind the connection with a destination server */ 962 cp->dest = NULL; 963 ip_vs_bind_dest(cp, dest); 964 965 /* Set its state and timeout */ 966 cp->state = 0; 967 cp->old_state = 0; 968 cp->timeout = 3*HZ; 969 cp->sync_endtime = jiffies & ~3UL; 970 971 /* Bind its packet transmitter */ 972 #ifdef CONFIG_IP_VS_IPV6 973 if (p->af == AF_INET6) 974 ip_vs_bind_xmit_v6(cp); 975 else 976 #endif 977 ip_vs_bind_xmit(cp); 978 979 if (unlikely(pd && atomic_read(&pd->appcnt))) 980 ip_vs_bind_app(cp, pd->pp); 981 982 /* 983 * Allow conntrack to be preserved. By default, conntrack 984 * is created and destroyed for every packet. 985 * Sometimes keeping conntrack can be useful for 986 * IP_VS_CONN_F_ONE_PACKET too. 987 */ 988 989 if (ip_vs_conntrack_enabled(ipvs)) 990 cp->flags |= IP_VS_CONN_F_NFCT; 991 992 /* Hash it in the ip_vs_conn_tab finally */ 993 ip_vs_conn_hash(cp); 994 995 return cp; 996 } 997 998 /* 999 * /proc/net/ip_vs_conn entries 1000 */ 1001 #ifdef CONFIG_PROC_FS 1002 struct ip_vs_iter_state { 1003 struct seq_net_private p; 1004 struct hlist_head *l; 1005 }; 1006 1007 static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 1008 { 1009 int idx; 1010 struct ip_vs_conn *cp; 1011 struct ip_vs_iter_state *iter = seq->private; 1012 1013 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1014 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1015 /* __ip_vs_conn_get() is not needed by 1016 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show 1017 */ 1018 if (pos-- == 0) { 1019 iter->l = &ip_vs_conn_tab[idx]; 1020 return cp; 1021 } 1022 } 1023 cond_resched_rcu(); 1024 } 1025 1026 return NULL; 1027 } 1028 1029 static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 1030 __acquires(RCU) 1031 { 1032 struct ip_vs_iter_state *iter = seq->private; 1033 1034 iter->l = NULL; 1035 rcu_read_lock(); 1036 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 1037 } 1038 1039 static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1040 { 1041 struct ip_vs_conn *cp = v; 1042 struct ip_vs_iter_state *iter = seq->private; 1043 struct hlist_node *e; 1044 struct hlist_head *l = iter->l; 1045 int idx; 1046 1047 ++*pos; 1048 if (v == SEQ_START_TOKEN) 1049 return ip_vs_conn_array(seq, 0); 1050 1051 /* more on same hash chain? */ 1052 e = rcu_dereference(hlist_next_rcu(&cp->c_list)); 1053 if (e) 1054 return hlist_entry(e, struct ip_vs_conn, c_list); 1055 1056 idx = l - ip_vs_conn_tab; 1057 while (++idx < ip_vs_conn_tab_size) { 1058 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1059 iter->l = &ip_vs_conn_tab[idx]; 1060 return cp; 1061 } 1062 cond_resched_rcu(); 1063 } 1064 iter->l = NULL; 1065 return NULL; 1066 } 1067 1068 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 1069 __releases(RCU) 1070 { 1071 rcu_read_unlock(); 1072 } 1073 1074 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 1075 { 1076 1077 if (v == SEQ_START_TOKEN) 1078 seq_puts(seq, 1079 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); 1080 else { 1081 const struct ip_vs_conn *cp = v; 1082 struct net *net = seq_file_net(seq); 1083 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; 1084 size_t len = 0; 1085 char dbuf[IP_VS_ADDRSTRLEN]; 1086 1087 if (!net_eq(cp->ipvs->net, net)) 1088 return 0; 1089 if (cp->pe_data) { 1090 pe_data[0] = ' '; 1091 len = strlen(cp->pe->name); 1092 memcpy(pe_data + 1, cp->pe->name, len); 1093 pe_data[len + 1] = ' '; 1094 len += 2; 1095 len += cp->pe->show_pe_data(cp, pe_data + len); 1096 } 1097 pe_data[len] = '\0'; 1098 1099 #ifdef CONFIG_IP_VS_IPV6 1100 if (cp->daf == AF_INET6) 1101 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); 1102 else 1103 #endif 1104 snprintf(dbuf, sizeof(dbuf), "%08X", 1105 ntohl(cp->daddr.ip)); 1106 1107 #ifdef CONFIG_IP_VS_IPV6 1108 if (cp->af == AF_INET6) 1109 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1110 "%s %04X %-11s %7lu%s\n", 1111 ip_vs_proto_name(cp->protocol), 1112 &cp->caddr.in6, ntohs(cp->cport), 1113 &cp->vaddr.in6, ntohs(cp->vport), 1114 dbuf, ntohs(cp->dport), 1115 ip_vs_state_name(cp->protocol, cp->state), 1116 (cp->timer.expires-jiffies)/HZ, pe_data); 1117 else 1118 #endif 1119 seq_printf(seq, 1120 "%-3s %08X %04X %08X %04X" 1121 " %s %04X %-11s %7lu%s\n", 1122 ip_vs_proto_name(cp->protocol), 1123 ntohl(cp->caddr.ip), ntohs(cp->cport), 1124 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1125 dbuf, ntohs(cp->dport), 1126 ip_vs_state_name(cp->protocol, cp->state), 1127 (cp->timer.expires-jiffies)/HZ, pe_data); 1128 } 1129 return 0; 1130 } 1131 1132 static const struct seq_operations ip_vs_conn_seq_ops = { 1133 .start = ip_vs_conn_seq_start, 1134 .next = ip_vs_conn_seq_next, 1135 .stop = ip_vs_conn_seq_stop, 1136 .show = ip_vs_conn_seq_show, 1137 }; 1138 1139 static int ip_vs_conn_open(struct inode *inode, struct file *file) 1140 { 1141 return seq_open_net(inode, file, &ip_vs_conn_seq_ops, 1142 sizeof(struct ip_vs_iter_state)); 1143 } 1144 1145 static const struct file_operations ip_vs_conn_fops = { 1146 .owner = THIS_MODULE, 1147 .open = ip_vs_conn_open, 1148 .read = seq_read, 1149 .llseek = seq_lseek, 1150 .release = seq_release_net, 1151 }; 1152 1153 static const char *ip_vs_origin_name(unsigned int flags) 1154 { 1155 if (flags & IP_VS_CONN_F_SYNC) 1156 return "SYNC"; 1157 else 1158 return "LOCAL"; 1159 } 1160 1161 static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) 1162 { 1163 char dbuf[IP_VS_ADDRSTRLEN]; 1164 1165 if (v == SEQ_START_TOKEN) 1166 seq_puts(seq, 1167 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 1168 else { 1169 const struct ip_vs_conn *cp = v; 1170 struct net *net = seq_file_net(seq); 1171 1172 if (!net_eq(cp->ipvs->net, net)) 1173 return 0; 1174 1175 #ifdef CONFIG_IP_VS_IPV6 1176 if (cp->daf == AF_INET6) 1177 snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6); 1178 else 1179 #endif 1180 snprintf(dbuf, sizeof(dbuf), "%08X", 1181 ntohl(cp->daddr.ip)); 1182 1183 #ifdef CONFIG_IP_VS_IPV6 1184 if (cp->af == AF_INET6) 1185 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " 1186 "%s %04X %-11s %-6s %7lu\n", 1187 ip_vs_proto_name(cp->protocol), 1188 &cp->caddr.in6, ntohs(cp->cport), 1189 &cp->vaddr.in6, ntohs(cp->vport), 1190 dbuf, ntohs(cp->dport), 1191 ip_vs_state_name(cp->protocol, cp->state), 1192 ip_vs_origin_name(cp->flags), 1193 (cp->timer.expires-jiffies)/HZ); 1194 else 1195 #endif 1196 seq_printf(seq, 1197 "%-3s %08X %04X %08X %04X " 1198 "%s %04X %-11s %-6s %7lu\n", 1199 ip_vs_proto_name(cp->protocol), 1200 ntohl(cp->caddr.ip), ntohs(cp->cport), 1201 ntohl(cp->vaddr.ip), ntohs(cp->vport), 1202 dbuf, ntohs(cp->dport), 1203 ip_vs_state_name(cp->protocol, cp->state), 1204 ip_vs_origin_name(cp->flags), 1205 (cp->timer.expires-jiffies)/HZ); 1206 } 1207 return 0; 1208 } 1209 1210 static const struct seq_operations ip_vs_conn_sync_seq_ops = { 1211 .start = ip_vs_conn_seq_start, 1212 .next = ip_vs_conn_seq_next, 1213 .stop = ip_vs_conn_seq_stop, 1214 .show = ip_vs_conn_sync_seq_show, 1215 }; 1216 1217 static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 1218 { 1219 return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, 1220 sizeof(struct ip_vs_iter_state)); 1221 } 1222 1223 static const struct file_operations ip_vs_conn_sync_fops = { 1224 .owner = THIS_MODULE, 1225 .open = ip_vs_conn_sync_open, 1226 .read = seq_read, 1227 .llseek = seq_lseek, 1228 .release = seq_release_net, 1229 }; 1230 1231 #endif 1232 1233 1234 /* 1235 * Randomly drop connection entries before running out of memory 1236 */ 1237 static inline int todrop_entry(struct ip_vs_conn *cp) 1238 { 1239 /* 1240 * The drop rate array needs tuning for real environments. 1241 * Called from timer bh only => no locking 1242 */ 1243 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 1244 static char todrop_counter[9] = {0}; 1245 int i; 1246 1247 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 1248 This will leave enough time for normal connection to get 1249 through. */ 1250 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 1251 return 0; 1252 1253 /* Don't drop the entry if its number of incoming packets is not 1254 located in [0, 8] */ 1255 i = atomic_read(&cp->in_pkts); 1256 if (i > 8 || i < 0) return 0; 1257 1258 if (!todrop_rate[i]) return 0; 1259 if (--todrop_counter[i] > 0) return 0; 1260 1261 todrop_counter[i] = todrop_rate[i]; 1262 return 1; 1263 } 1264 1265 static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) 1266 { 1267 struct ip_vs_service *svc; 1268 1269 if (!cp->dest) 1270 return false; 1271 svc = rcu_dereference(cp->dest->svc); 1272 return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); 1273 } 1274 1275 /* Called from keventd and must protect itself from softirqs */ 1276 void ip_vs_random_dropentry(struct netns_ipvs *ipvs) 1277 { 1278 int idx; 1279 struct ip_vs_conn *cp, *cp_c; 1280 1281 rcu_read_lock(); 1282 /* 1283 * Randomly scan 1/32 of the whole table every second 1284 */ 1285 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { 1286 unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; 1287 1288 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { 1289 if (cp->ipvs != ipvs) 1290 continue; 1291 if (cp->flags & IP_VS_CONN_F_TEMPLATE) { 1292 if (atomic_read(&cp->n_control) || 1293 !ip_vs_conn_ops_mode(cp)) 1294 continue; 1295 else 1296 /* connection template of OPS */ 1297 goto try_drop; 1298 } 1299 if (cp->protocol == IPPROTO_TCP) { 1300 switch(cp->state) { 1301 case IP_VS_TCP_S_SYN_RECV: 1302 case IP_VS_TCP_S_SYNACK: 1303 break; 1304 1305 case IP_VS_TCP_S_ESTABLISHED: 1306 if (todrop_entry(cp)) 1307 break; 1308 continue; 1309 1310 default: 1311 continue; 1312 } 1313 } else if (cp->protocol == IPPROTO_SCTP) { 1314 switch (cp->state) { 1315 case IP_VS_SCTP_S_INIT1: 1316 case IP_VS_SCTP_S_INIT: 1317 break; 1318 case IP_VS_SCTP_S_ESTABLISHED: 1319 if (todrop_entry(cp)) 1320 break; 1321 continue; 1322 default: 1323 continue; 1324 } 1325 } else { 1326 try_drop: 1327 if (!todrop_entry(cp)) 1328 continue; 1329 } 1330 1331 IP_VS_DBG(4, "del connection\n"); 1332 ip_vs_conn_expire_now(cp); 1333 cp_c = cp->control; 1334 /* cp->control is valid only with reference to cp */ 1335 if (cp_c && __ip_vs_conn_get(cp)) { 1336 IP_VS_DBG(4, "del conn template\n"); 1337 ip_vs_conn_expire_now(cp_c); 1338 __ip_vs_conn_put(cp); 1339 } 1340 } 1341 cond_resched_rcu(); 1342 } 1343 rcu_read_unlock(); 1344 } 1345 1346 1347 /* 1348 * Flush all the connection entries in the ip_vs_conn_tab 1349 */ 1350 static void ip_vs_conn_flush(struct netns_ipvs *ipvs) 1351 { 1352 int idx; 1353 struct ip_vs_conn *cp, *cp_c; 1354 1355 flush_again: 1356 rcu_read_lock(); 1357 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1358 1359 hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { 1360 if (cp->ipvs != ipvs) 1361 continue; 1362 IP_VS_DBG(4, "del connection\n"); 1363 ip_vs_conn_expire_now(cp); 1364 cp_c = cp->control; 1365 /* cp->control is valid only with reference to cp */ 1366 if (cp_c && __ip_vs_conn_get(cp)) { 1367 IP_VS_DBG(4, "del conn template\n"); 1368 ip_vs_conn_expire_now(cp_c); 1369 __ip_vs_conn_put(cp); 1370 } 1371 } 1372 cond_resched_rcu(); 1373 } 1374 rcu_read_unlock(); 1375 1376 /* the counter may be not NULL, because maybe some conn entries 1377 are run by slow timer handler or unhashed but still referred */ 1378 if (atomic_read(&ipvs->conn_count) != 0) { 1379 schedule(); 1380 goto flush_again; 1381 } 1382 } 1383 /* 1384 * per netns init and exit 1385 */ 1386 int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) 1387 { 1388 atomic_set(&ipvs->conn_count, 0); 1389 1390 proc_create("ip_vs_conn", 0, ipvs->net->proc_net, &ip_vs_conn_fops); 1391 proc_create("ip_vs_conn_sync", 0, ipvs->net->proc_net, 1392 &ip_vs_conn_sync_fops); 1393 return 0; 1394 } 1395 1396 void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) 1397 { 1398 /* flush all the connection entries first */ 1399 ip_vs_conn_flush(ipvs); 1400 remove_proc_entry("ip_vs_conn", ipvs->net->proc_net); 1401 remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net); 1402 } 1403 1404 int __init ip_vs_conn_init(void) 1405 { 1406 int idx; 1407 1408 /* Compute size and mask */ 1409 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1410 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 1411 1412 /* 1413 * Allocate the connection hash table and initialize its list heads 1414 */ 1415 ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); 1416 if (!ip_vs_conn_tab) 1417 return -ENOMEM; 1418 1419 /* Allocate ip_vs_conn slab cache */ 1420 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 1421 sizeof(struct ip_vs_conn), 0, 1422 SLAB_HWCACHE_ALIGN, NULL); 1423 if (!ip_vs_conn_cachep) { 1424 vfree(ip_vs_conn_tab); 1425 return -ENOMEM; 1426 } 1427 1428 pr_info("Connection hash table configured " 1429 "(size=%d, memory=%ldKbytes)\n", 1430 ip_vs_conn_tab_size, 1431 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1432 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1433 sizeof(struct ip_vs_conn)); 1434 1435 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) 1436 INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); 1437 1438 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 1439 spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); 1440 } 1441 1442 /* calculate the random value for connection hash */ 1443 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1444 1445 return 0; 1446 } 1447 1448 void ip_vs_conn_cleanup(void) 1449 { 1450 /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ 1451 rcu_barrier(); 1452 /* Release the empty cache */ 1453 kmem_cache_destroy(ip_vs_conn_cachep); 1454 vfree(ip_vs_conn_tab); 1455 } 1456