1 /* 2 * IPVS: Locality-Based Least-Connection scheduling module 3 * 4 * Authors: Wensong Zhang <wensong@gnuchina.org> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 * 11 * Changes: 12 * Martin Hamilton : fixed the terrible locking bugs 13 * *lock(tbl->lock) ==> *lock(&tbl->lock) 14 * Wensong Zhang : fixed the uninitialized tbl->lock bug 15 * Wensong Zhang : added doing full expiration check to 16 * collect stale entries of 24+ hours when 17 * no partial expire check in a half hour 18 * Julian Anastasov : replaced del_timer call with del_timer_sync 19 * to avoid the possible race between timer 20 * handler and del_timer thread in SMP 21 * 22 */ 23 24 /* 25 * The lblc algorithm is as follows (pseudo code): 26 * 27 * if cachenode[dest_ip] is null then 28 * n, cachenode[dest_ip] <- {weighted least-conn node}; 29 * else 30 * n <- cachenode[dest_ip]; 31 * if (n is dead) OR 32 * (n.conns>n.weight AND 33 * there is a node m with m.conns<m.weight/2) then 34 * n, cachenode[dest_ip] <- {weighted least-conn node}; 35 * 36 * return n; 37 * 38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing 39 * me to write this module. 40 */ 41 42 #define KMSG_COMPONENT "IPVS" 43 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 44 45 #include <linux/ip.h> 46 #include <linux/slab.h> 47 #include <linux/module.h> 48 #include <linux/kernel.h> 49 #include <linux/skbuff.h> 50 #include <linux/jiffies.h> 51 #include <linux/hash.h> 52 53 /* for sysctl */ 54 #include <linux/fs.h> 55 #include <linux/sysctl.h> 56 57 #include <net/ip_vs.h> 58 59 60 /* 61 * It is for garbage collection of stale IPVS lblc entries, 62 * when the table is full. 63 */ 64 #define CHECK_EXPIRE_INTERVAL (60*HZ) 65 #define ENTRY_TIMEOUT (6*60*HZ) 66 67 #define DEFAULT_EXPIRATION (24*60*60*HZ) 68 69 /* 70 * It is for full expiration check. 71 * When there is no partial expiration check (garbage collection) 72 * in a half hour, do a full expiration check to collect stale 73 * entries that haven't been touched for a day. 74 */ 75 #define COUNT_FOR_FULL_EXPIRATION 30 76 77 78 /* 79 * for IPVS lblc entry hash table 80 */ 81 #ifndef CONFIG_IP_VS_LBLC_TAB_BITS 82 #define CONFIG_IP_VS_LBLC_TAB_BITS 10 83 #endif 84 #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS 85 #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) 86 #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) 87 88 89 /* 90 * IPVS lblc entry represents an association between destination 91 * IP address and its destination server 92 */ 93 struct ip_vs_lblc_entry { 94 struct hlist_node list; 95 int af; /* address family */ 96 union nf_inet_addr addr; /* destination IP address */ 97 struct ip_vs_dest *dest; /* real server (cache) */ 98 unsigned long lastuse; /* last used time */ 99 struct rcu_head rcu_head; 100 }; 101 102 103 /* 104 * IPVS lblc hash table 105 */ 106 struct ip_vs_lblc_table { 107 struct rcu_head rcu_head; 108 struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 109 struct timer_list periodic_timer; /* collect stale entries */ 110 struct ip_vs_service *svc; /* pointer back to service */ 111 atomic_t entries; /* number of entries */ 112 int max_size; /* maximum size of entries */ 113 int rover; /* rover for expire check */ 114 int counter; /* counter for no expire */ 115 bool dead; 116 }; 117 118 119 /* 120 * IPVS LBLC sysctl table 121 */ 122 #ifdef CONFIG_SYSCTL 123 static struct ctl_table vs_vars_table[] = { 124 { 125 .procname = "lblc_expiration", 126 .data = NULL, 127 .maxlen = sizeof(int), 128 .mode = 0644, 129 .proc_handler = proc_dointvec_jiffies, 130 }, 131 { } 132 }; 133 #endif 134 135 static void ip_vs_lblc_rcu_free(struct rcu_head *head) 136 { 137 struct ip_vs_lblc_entry *en = container_of(head, 138 struct ip_vs_lblc_entry, 139 rcu_head); 140 141 ip_vs_dest_put_and_free(en->dest); 142 kfree(en); 143 } 144 145 static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en) 146 { 147 hlist_del_rcu(&en->list); 148 call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free); 149 } 150 151 /* 152 * Returns hash value for IPVS LBLC entry 153 */ 154 static inline unsigned int 155 ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) 156 { 157 __be32 addr_fold = addr->ip; 158 159 #ifdef CONFIG_IP_VS_IPV6 160 if (af == AF_INET6) 161 addr_fold = addr->ip6[0]^addr->ip6[1]^ 162 addr->ip6[2]^addr->ip6[3]; 163 #endif 164 return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); 165 } 166 167 168 /* 169 * Hash an entry in the ip_vs_lblc_table. 170 * returns bool success. 171 */ 172 static void 173 ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 174 { 175 unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr); 176 177 hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); 178 atomic_inc(&tbl->entries); 179 } 180 181 182 /* Get ip_vs_lblc_entry associated with supplied parameters. */ 183 static inline struct ip_vs_lblc_entry * 184 ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, 185 const union nf_inet_addr *addr) 186 { 187 unsigned int hash = ip_vs_lblc_hashkey(af, addr); 188 struct ip_vs_lblc_entry *en; 189 190 hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) 191 if (ip_vs_addr_equal(af, &en->addr, addr)) 192 return en; 193 194 return NULL; 195 } 196 197 198 /* 199 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP 200 * address to a server. Called under spin lock. 201 */ 202 static inline struct ip_vs_lblc_entry * 203 ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, 204 u16 af, struct ip_vs_dest *dest) 205 { 206 struct ip_vs_lblc_entry *en; 207 208 en = ip_vs_lblc_get(af, tbl, daddr); 209 if (en) { 210 if (en->dest == dest) 211 return en; 212 ip_vs_lblc_del(en); 213 } 214 en = kmalloc(sizeof(*en), GFP_ATOMIC); 215 if (!en) 216 return NULL; 217 218 en->af = af; 219 ip_vs_addr_copy(af, &en->addr, daddr); 220 en->lastuse = jiffies; 221 222 ip_vs_dest_hold(dest); 223 en->dest = dest; 224 225 ip_vs_lblc_hash(tbl, en); 226 227 return en; 228 } 229 230 231 /* 232 * Flush all the entries of the specified table. 233 */ 234 static void ip_vs_lblc_flush(struct ip_vs_service *svc) 235 { 236 struct ip_vs_lblc_table *tbl = svc->sched_data; 237 struct ip_vs_lblc_entry *en; 238 struct hlist_node *next; 239 int i; 240 241 spin_lock_bh(&svc->sched_lock); 242 tbl->dead = true; 243 for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { 244 hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { 245 ip_vs_lblc_del(en); 246 atomic_dec(&tbl->entries); 247 } 248 } 249 spin_unlock_bh(&svc->sched_lock); 250 } 251 252 static int sysctl_lblc_expiration(struct ip_vs_service *svc) 253 { 254 #ifdef CONFIG_SYSCTL 255 return svc->ipvs->sysctl_lblc_expiration; 256 #else 257 return DEFAULT_EXPIRATION; 258 #endif 259 } 260 261 static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) 262 { 263 struct ip_vs_lblc_table *tbl = svc->sched_data; 264 struct ip_vs_lblc_entry *en; 265 struct hlist_node *next; 266 unsigned long now = jiffies; 267 int i, j; 268 269 for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { 270 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 271 272 spin_lock(&svc->sched_lock); 273 hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { 274 if (time_before(now, 275 en->lastuse + 276 sysctl_lblc_expiration(svc))) 277 continue; 278 279 ip_vs_lblc_del(en); 280 atomic_dec(&tbl->entries); 281 } 282 spin_unlock(&svc->sched_lock); 283 } 284 tbl->rover = j; 285 } 286 287 288 /* 289 * Periodical timer handler for IPVS lblc table 290 * It is used to collect stale entries when the number of entries 291 * exceeds the maximum size of the table. 292 * 293 * Fixme: we probably need more complicated algorithm to collect 294 * entries that have not been used for a long time even 295 * if the number of entries doesn't exceed the maximum size 296 * of the table. 297 * The full expiration check is for this purpose now. 298 */ 299 static void ip_vs_lblc_check_expire(struct timer_list *t) 300 { 301 struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); 302 struct ip_vs_service *svc = tbl->svc; 303 unsigned long now = jiffies; 304 int goal; 305 int i, j; 306 struct ip_vs_lblc_entry *en; 307 struct hlist_node *next; 308 309 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 310 /* do full expiration check */ 311 ip_vs_lblc_full_check(svc); 312 tbl->counter = 1; 313 goto out; 314 } 315 316 if (atomic_read(&tbl->entries) <= tbl->max_size) { 317 tbl->counter++; 318 goto out; 319 } 320 321 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; 322 if (goal > tbl->max_size/2) 323 goal = tbl->max_size/2; 324 325 for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) { 326 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 327 328 spin_lock(&svc->sched_lock); 329 hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { 330 if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) 331 continue; 332 333 ip_vs_lblc_del(en); 334 atomic_dec(&tbl->entries); 335 goal--; 336 } 337 spin_unlock(&svc->sched_lock); 338 if (goal <= 0) 339 break; 340 } 341 tbl->rover = j; 342 343 out: 344 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); 345 } 346 347 348 static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) 349 { 350 int i; 351 struct ip_vs_lblc_table *tbl; 352 353 /* 354 * Allocate the ip_vs_lblc_table for this service 355 */ 356 tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); 357 if (tbl == NULL) 358 return -ENOMEM; 359 360 svc->sched_data = tbl; 361 IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for " 362 "current service\n", sizeof(*tbl)); 363 364 /* 365 * Initialize the hash buckets 366 */ 367 for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { 368 INIT_HLIST_HEAD(&tbl->bucket[i]); 369 } 370 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 371 tbl->rover = 0; 372 tbl->counter = 1; 373 tbl->dead = false; 374 tbl->svc = svc; 375 atomic_set(&tbl->entries, 0); 376 377 /* 378 * Hook periodic timer for garbage collection 379 */ 380 timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0); 381 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); 382 383 return 0; 384 } 385 386 387 static void ip_vs_lblc_done_svc(struct ip_vs_service *svc) 388 { 389 struct ip_vs_lblc_table *tbl = svc->sched_data; 390 391 /* remove periodic timer */ 392 del_timer_sync(&tbl->periodic_timer); 393 394 /* got to clean up table entries here */ 395 ip_vs_lblc_flush(svc); 396 397 /* release the table itself */ 398 kfree_rcu(tbl, rcu_head); 399 IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n", 400 sizeof(*tbl)); 401 } 402 403 404 static inline struct ip_vs_dest * 405 __ip_vs_lblc_schedule(struct ip_vs_service *svc) 406 { 407 struct ip_vs_dest *dest, *least; 408 int loh, doh; 409 410 /* 411 * We use the following formula to estimate the load: 412 * (dest overhead) / dest->weight 413 * 414 * Remember -- no floats in kernel mode!!! 415 * The comparison of h1*w2 > h2*w1 is equivalent to that of 416 * h1/w1 > h2/w2 417 * if every weight is larger than zero. 418 * 419 * The server with weight=0 is quiesced and will not receive any 420 * new connection. 421 */ 422 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 423 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 424 continue; 425 if (atomic_read(&dest->weight) > 0) { 426 least = dest; 427 loh = ip_vs_dest_conn_overhead(least); 428 goto nextstage; 429 } 430 } 431 return NULL; 432 433 /* 434 * Find the destination with the least load. 435 */ 436 nextstage: 437 list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { 438 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 439 continue; 440 441 doh = ip_vs_dest_conn_overhead(dest); 442 if ((__s64)loh * atomic_read(&dest->weight) > 443 (__s64)doh * atomic_read(&least->weight)) { 444 least = dest; 445 loh = doh; 446 } 447 } 448 449 IP_VS_DBG_BUF(6, "LBLC: server %s:%d " 450 "activeconns %d refcnt %d weight %d overhead %d\n", 451 IP_VS_DBG_ADDR(least->af, &least->addr), 452 ntohs(least->port), 453 atomic_read(&least->activeconns), 454 refcount_read(&least->refcnt), 455 atomic_read(&least->weight), loh); 456 457 return least; 458 } 459 460 461 /* 462 * If this destination server is overloaded and there is a less loaded 463 * server, then return true. 464 */ 465 static inline int 466 is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) 467 { 468 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { 469 struct ip_vs_dest *d; 470 471 list_for_each_entry_rcu(d, &svc->destinations, n_list) { 472 if (atomic_read(&d->activeconns)*2 473 < atomic_read(&d->weight)) { 474 return 1; 475 } 476 } 477 } 478 return 0; 479 } 480 481 482 /* 483 * Locality-Based (weighted) Least-Connection scheduling 484 */ 485 static struct ip_vs_dest * 486 ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, 487 struct ip_vs_iphdr *iph) 488 { 489 struct ip_vs_lblc_table *tbl = svc->sched_data; 490 struct ip_vs_dest *dest = NULL; 491 struct ip_vs_lblc_entry *en; 492 493 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); 494 495 /* First look in our cache */ 496 en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr); 497 if (en) { 498 /* We only hold a read lock, but this is atomic */ 499 en->lastuse = jiffies; 500 501 /* 502 * If the destination is not available, i.e. it's in the trash, 503 * we must ignore it, as it may be removed from under our feet, 504 * if someone drops our reference count. Our caller only makes 505 * sure that destinations, that are not in the trash, are not 506 * moved to the trash, while we are scheduling. But anyone can 507 * free up entries from the trash at any time. 508 */ 509 510 dest = en->dest; 511 if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && 512 atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) 513 goto out; 514 } 515 516 /* No cache entry or it is invalid, time to schedule */ 517 dest = __ip_vs_lblc_schedule(svc); 518 if (!dest) { 519 ip_vs_scheduler_err(svc, "no destination available"); 520 return NULL; 521 } 522 523 /* If we fail to create a cache entry, we'll just use the valid dest */ 524 spin_lock_bh(&svc->sched_lock); 525 if (!tbl->dead) 526 ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest); 527 spin_unlock_bh(&svc->sched_lock); 528 529 out: 530 IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", 531 IP_VS_DBG_ADDR(svc->af, &iph->daddr), 532 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); 533 534 return dest; 535 } 536 537 538 /* 539 * IPVS LBLC Scheduler structure 540 */ 541 static struct ip_vs_scheduler ip_vs_lblc_scheduler = { 542 .name = "lblc", 543 .refcnt = ATOMIC_INIT(0), 544 .module = THIS_MODULE, 545 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), 546 .init_service = ip_vs_lblc_init_svc, 547 .done_service = ip_vs_lblc_done_svc, 548 .schedule = ip_vs_lblc_schedule, 549 }; 550 551 /* 552 * per netns init. 553 */ 554 #ifdef CONFIG_SYSCTL 555 static int __net_init __ip_vs_lblc_init(struct net *net) 556 { 557 struct netns_ipvs *ipvs = net_ipvs(net); 558 559 if (!ipvs) 560 return -ENOENT; 561 562 if (!net_eq(net, &init_net)) { 563 ipvs->lblc_ctl_table = kmemdup(vs_vars_table, 564 sizeof(vs_vars_table), 565 GFP_KERNEL); 566 if (ipvs->lblc_ctl_table == NULL) 567 return -ENOMEM; 568 569 /* Don't export sysctls to unprivileged users */ 570 if (net->user_ns != &init_user_ns) 571 ipvs->lblc_ctl_table[0].procname = NULL; 572 573 } else 574 ipvs->lblc_ctl_table = vs_vars_table; 575 ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; 576 ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; 577 578 ipvs->lblc_ctl_header = 579 register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table); 580 if (!ipvs->lblc_ctl_header) { 581 if (!net_eq(net, &init_net)) 582 kfree(ipvs->lblc_ctl_table); 583 return -ENOMEM; 584 } 585 586 return 0; 587 } 588 589 static void __net_exit __ip_vs_lblc_exit(struct net *net) 590 { 591 struct netns_ipvs *ipvs = net_ipvs(net); 592 593 unregister_net_sysctl_table(ipvs->lblc_ctl_header); 594 595 if (!net_eq(net, &init_net)) 596 kfree(ipvs->lblc_ctl_table); 597 } 598 599 #else 600 601 static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } 602 static void __net_exit __ip_vs_lblc_exit(struct net *net) { } 603 604 #endif 605 606 static struct pernet_operations ip_vs_lblc_ops = { 607 .init = __ip_vs_lblc_init, 608 .exit = __ip_vs_lblc_exit, 609 }; 610 611 static int __init ip_vs_lblc_init(void) 612 { 613 int ret; 614 615 ret = register_pernet_subsys(&ip_vs_lblc_ops); 616 if (ret) 617 return ret; 618 619 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); 620 if (ret) 621 unregister_pernet_subsys(&ip_vs_lblc_ops); 622 return ret; 623 } 624 625 static void __exit ip_vs_lblc_cleanup(void) 626 { 627 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); 628 unregister_pernet_subsys(&ip_vs_lblc_ops); 629 rcu_barrier(); 630 } 631 632 633 module_init(ip_vs_lblc_init); 634 module_exit(ip_vs_lblc_cleanup); 635 MODULE_LICENSE("GPL"); 636