1 // SPDX-License-Identifier: GPL-2.0 2 /* IPVS: Maglev Hashing scheduling module 3 * 4 * Authors: Inju Song <inju.song@navercorp.com> 5 * 6 */ 7 8 /* The mh algorithm is to assign a preference list of all the lookup 9 * table positions to each destination and populate the table with 10 * the most-preferred position of destinations. Then it is to select 11 * destination with the hash key of source IP address through looking 12 * up a the lookup table. 13 * 14 * The algorithm is detailed in: 15 * [3.4 Consistent Hasing] 16 https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf 17 * 18 */ 19 20 #define KMSG_COMPONENT "IPVS" 21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 22 23 #include <linux/ip.h> 24 #include <linux/slab.h> 25 #include <linux/module.h> 26 #include <linux/kernel.h> 27 #include <linux/skbuff.h> 28 29 #include <net/ip_vs.h> 30 31 #include <linux/siphash.h> 32 #include <linux/bitops.h> 33 #include <linux/gcd.h> 34 35 #define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ 36 #define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ 37 38 struct ip_vs_mh_lookup { 39 struct ip_vs_dest __rcu *dest; /* real server (cache) */ 40 }; 41 42 struct ip_vs_mh_dest_setup { 43 unsigned int offset; /* starting offset */ 44 unsigned int skip; /* skip */ 45 unsigned int perm; /* next_offset */ 46 int turns; /* weight / gcd() and rshift */ 47 }; 48 49 /* Available prime numbers for MH table */ 50 static int primes[] = {251, 509, 1021, 2039, 4093, 51 8191, 16381, 32749, 65521, 131071}; 52 53 /* For IPVS MH entry hash table */ 54 #ifndef CONFIG_IP_VS_MH_TAB_INDEX 55 #define CONFIG_IP_VS_MH_TAB_INDEX 12 56 #endif 57 #define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) 58 #define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) 59 #define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] 60 61 struct ip_vs_mh_state { 62 struct rcu_head rcu_head; 63 struct ip_vs_mh_lookup *lookup; 64 struct ip_vs_mh_dest_setup *dest_setup; 65 hsiphash_key_t hash1, hash2; 66 int gcd; 67 int rshift; 68 }; 69 70 static inline void generate_hash_secret(hsiphash_key_t *hash1, 71 hsiphash_key_t *hash2) 72 { 73 hash1->key[0] = 2654435761UL; 74 hash1->key[1] = 2654435761UL; 75 76 hash2->key[0] = 2654446892UL; 77 hash2->key[1] = 2654446892UL; 78 } 79 80 /* Helper function to determine if server is unavailable */ 81 static inline bool is_unavailable(struct ip_vs_dest *dest) 82 { 83 return atomic_read(&dest->weight) <= 0 || 84 dest->flags & IP_VS_DEST_F_OVERLOAD; 85 } 86 87 /* Returns hash value for IPVS MH entry */ 88 static inline unsigned int 89 ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr, 90 __be16 port, hsiphash_key_t *key, unsigned int offset) 91 { 92 unsigned int v; 93 __be32 addr_fold = addr->ip; 94 95 #ifdef CONFIG_IP_VS_IPV6 96 if (af == AF_INET6) 97 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ 98 addr->ip6[2] ^ addr->ip6[3]; 99 #endif 100 v = (offset + ntohs(port) + ntohl(addr_fold)); 101 return hsiphash(&v, sizeof(v), key); 102 } 103 104 /* Reset all the hash buckets of the specified table. */ 105 static void ip_vs_mh_reset(struct ip_vs_mh_state *s) 106 { 107 int i; 108 struct ip_vs_mh_lookup *l; 109 struct ip_vs_dest *dest; 110 111 l = &s->lookup[0]; 112 for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { 113 dest = rcu_dereference_protected(l->dest, 1); 114 if (dest) { 115 ip_vs_dest_put(dest); 116 RCU_INIT_POINTER(l->dest, NULL); 117 } 118 l++; 119 } 120 } 121 122 static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, 123 struct ip_vs_service *svc) 124 { 125 struct list_head *p; 126 struct ip_vs_mh_dest_setup *ds; 127 struct ip_vs_dest *dest; 128 int lw; 129 130 /* If gcd is smaller then 1, number of dests or 131 * all last_weight of dests are zero. So, skip 132 * permutation for the dests. 133 */ 134 if (s->gcd < 1) 135 return 0; 136 137 /* Set dest_setup for the dests permutation */ 138 p = &svc->destinations; 139 ds = &s->dest_setup[0]; 140 while ((p = p->next) != &svc->destinations) { 141 dest = list_entry(p, struct ip_vs_dest, n_list); 142 143 ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, 144 dest->port, &s->hash1, 0) % 145 IP_VS_MH_TAB_SIZE; 146 ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, 147 dest->port, &s->hash2, 0) % 148 (IP_VS_MH_TAB_SIZE - 1) + 1; 149 ds->perm = ds->offset; 150 151 lw = atomic_read(&dest->last_weight); 152 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); 153 ds++; 154 } 155 156 return 0; 157 } 158 159 static int ip_vs_mh_populate(struct ip_vs_mh_state *s, 160 struct ip_vs_service *svc) 161 { 162 int n, c, dt_count; 163 unsigned long *table; 164 struct list_head *p; 165 struct ip_vs_mh_dest_setup *ds; 166 struct ip_vs_dest *dest, *new_dest; 167 168 /* If gcd is smaller then 1, number of dests or 169 * all last_weight of dests are zero. So, skip 170 * the population for the dests and reset lookup table. 171 */ 172 if (s->gcd < 1) { 173 ip_vs_mh_reset(s); 174 return 0; 175 } 176 177 table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), 178 sizeof(unsigned long), GFP_KERNEL); 179 if (!table) 180 return -ENOMEM; 181 182 p = &svc->destinations; 183 n = 0; 184 dt_count = 0; 185 while (n < IP_VS_MH_TAB_SIZE) { 186 if (p == &svc->destinations) 187 p = p->next; 188 189 ds = &s->dest_setup[0]; 190 while (p != &svc->destinations) { 191 /* Ignore added server with zero weight */ 192 if (ds->turns < 1) { 193 p = p->next; 194 ds++; 195 continue; 196 } 197 198 c = ds->perm; 199 while (test_bit(c, table)) { 200 /* Add skip, mod IP_VS_MH_TAB_SIZE */ 201 ds->perm += ds->skip; 202 if (ds->perm >= IP_VS_MH_TAB_SIZE) 203 ds->perm -= IP_VS_MH_TAB_SIZE; 204 c = ds->perm; 205 } 206 207 __set_bit(c, table); 208 209 dest = rcu_dereference_protected(s->lookup[c].dest, 1); 210 new_dest = list_entry(p, struct ip_vs_dest, n_list); 211 if (dest != new_dest) { 212 if (dest) 213 ip_vs_dest_put(dest); 214 ip_vs_dest_hold(new_dest); 215 RCU_INIT_POINTER(s->lookup[c].dest, new_dest); 216 } 217 218 if (++n == IP_VS_MH_TAB_SIZE) 219 goto out; 220 221 if (++dt_count >= ds->turns) { 222 dt_count = 0; 223 p = p->next; 224 ds++; 225 } 226 } 227 } 228 229 out: 230 kfree(table); 231 return 0; 232 } 233 234 /* Get ip_vs_dest associated with supplied parameters. */ 235 static inline struct ip_vs_dest * 236 ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s, 237 const union nf_inet_addr *addr, __be16 port) 238 { 239 unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) 240 % IP_VS_MH_TAB_SIZE; 241 struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); 242 243 return (!dest || is_unavailable(dest)) ? NULL : dest; 244 } 245 246 /* As ip_vs_mh_get, but with fallback if selected server is unavailable */ 247 static inline struct ip_vs_dest * 248 ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, 249 const union nf_inet_addr *addr, __be16 port) 250 { 251 unsigned int offset, roffset; 252 unsigned int hash, ihash; 253 struct ip_vs_dest *dest; 254 255 /* First try the dest it's supposed to go to */ 256 ihash = ip_vs_mh_hashkey(svc->af, addr, port, 257 &s->hash1, 0) % IP_VS_MH_TAB_SIZE; 258 dest = rcu_dereference(s->lookup[ihash].dest); 259 if (!dest) 260 return NULL; 261 if (!is_unavailable(dest)) 262 return dest; 263 264 IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting", 265 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); 266 267 /* If the original dest is unavailable, loop around the table 268 * starting from ihash to find a new dest 269 */ 270 for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { 271 roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; 272 hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 273 roffset) % IP_VS_MH_TAB_SIZE; 274 dest = rcu_dereference(s->lookup[hash].dest); 275 if (!dest) 276 break; 277 if (!is_unavailable(dest)) 278 return dest; 279 IP_VS_DBG_BUF(6, 280 "MH: selected unavailable server %s:%u (offset %u), reselecting", 281 IP_VS_DBG_ADDR(dest->af, &dest->addr), 282 ntohs(dest->port), roffset); 283 } 284 285 return NULL; 286 } 287 288 /* Assign all the hash buckets of the specified table with the service. */ 289 static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, 290 struct ip_vs_service *svc) 291 { 292 int ret; 293 294 if (svc->num_dests > IP_VS_MH_TAB_SIZE) 295 return -EINVAL; 296 297 if (svc->num_dests >= 1) { 298 s->dest_setup = kcalloc(svc->num_dests, 299 sizeof(struct ip_vs_mh_dest_setup), 300 GFP_KERNEL); 301 if (!s->dest_setup) 302 return -ENOMEM; 303 } 304 305 ip_vs_mh_permutate(s, svc); 306 307 ret = ip_vs_mh_populate(s, svc); 308 if (ret < 0) 309 goto out; 310 311 IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n", 312 IP_VS_DBG_ADDR(svc->af, &svc->addr), 313 ntohs(svc->port)); 314 315 out: 316 if (svc->num_dests >= 1) { 317 kfree(s->dest_setup); 318 s->dest_setup = NULL; 319 } 320 return ret; 321 } 322 323 static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) 324 { 325 struct ip_vs_dest *dest; 326 int weight; 327 int g = 0; 328 329 list_for_each_entry(dest, &svc->destinations, n_list) { 330 weight = atomic_read(&dest->last_weight); 331 if (weight > 0) { 332 if (g > 0) 333 g = gcd(weight, g); 334 else 335 g = weight; 336 } 337 } 338 return g; 339 } 340 341 /* To avoid assigning huge weight for the MH table, 342 * calculate shift value with gcd. 343 */ 344 static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) 345 { 346 struct ip_vs_dest *dest; 347 int new_weight, weight = 0; 348 int mw, shift; 349 350 /* If gcd is smaller then 1, number of dests or 351 * all last_weight of dests are zero. So, return 352 * shift value as zero. 353 */ 354 if (gcd < 1) 355 return 0; 356 357 list_for_each_entry(dest, &svc->destinations, n_list) { 358 new_weight = atomic_read(&dest->last_weight); 359 if (new_weight > weight) 360 weight = new_weight; 361 } 362 363 /* Because gcd is greater than zero, 364 * the maximum weight and gcd are always greater than zero 365 */ 366 mw = weight / gcd; 367 368 /* shift = occupied bits of weight/gcd - MH highest bits */ 369 shift = fls(mw) - IP_VS_MH_TAB_BITS; 370 return (shift >= 0) ? shift : 0; 371 } 372 373 static void ip_vs_mh_state_free(struct rcu_head *head) 374 { 375 struct ip_vs_mh_state *s; 376 377 s = container_of(head, struct ip_vs_mh_state, rcu_head); 378 kfree(s->lookup); 379 kfree(s); 380 } 381 382 static int ip_vs_mh_init_svc(struct ip_vs_service *svc) 383 { 384 int ret; 385 struct ip_vs_mh_state *s; 386 387 /* Allocate the MH table for this service */ 388 s = kzalloc(sizeof(*s), GFP_KERNEL); 389 if (!s) 390 return -ENOMEM; 391 392 s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), 393 GFP_KERNEL); 394 if (!s->lookup) { 395 kfree(s); 396 return -ENOMEM; 397 } 398 399 generate_hash_secret(&s->hash1, &s->hash2); 400 s->gcd = ip_vs_mh_gcd_weight(svc); 401 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); 402 403 IP_VS_DBG(6, 404 "MH lookup table (memory=%zdbytes) allocated for current service\n", 405 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); 406 407 /* Assign the lookup table with current dests */ 408 ret = ip_vs_mh_reassign(s, svc); 409 if (ret < 0) { 410 ip_vs_mh_reset(s); 411 ip_vs_mh_state_free(&s->rcu_head); 412 return ret; 413 } 414 415 /* No more failures, attach state */ 416 svc->sched_data = s; 417 return 0; 418 } 419 420 static void ip_vs_mh_done_svc(struct ip_vs_service *svc) 421 { 422 struct ip_vs_mh_state *s = svc->sched_data; 423 424 /* Got to clean up lookup entry here */ 425 ip_vs_mh_reset(s); 426 427 call_rcu(&s->rcu_head, ip_vs_mh_state_free); 428 IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", 429 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); 430 } 431 432 static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, 433 struct ip_vs_dest *dest) 434 { 435 struct ip_vs_mh_state *s = svc->sched_data; 436 437 s->gcd = ip_vs_mh_gcd_weight(svc); 438 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); 439 440 /* Assign the lookup table with the updated service */ 441 return ip_vs_mh_reassign(s, svc); 442 } 443 444 /* Helper function to get port number */ 445 static inline __be16 446 ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) 447 { 448 __be16 _ports[2], *ports; 449 450 /* At this point we know that we have a valid packet of some kind. 451 * Because ICMP packets are only guaranteed to have the first 8 452 * bytes, let's just grab the ports. Fortunately they're in the 453 * same position for all three of the protocols we care about. 454 */ 455 switch (iph->protocol) { 456 case IPPROTO_TCP: 457 case IPPROTO_UDP: 458 case IPPROTO_SCTP: 459 ports = skb_header_pointer(skb, iph->len, sizeof(_ports), 460 &_ports); 461 if (unlikely(!ports)) 462 return 0; 463 464 if (likely(!ip_vs_iph_inverse(iph))) 465 return ports[0]; 466 else 467 return ports[1]; 468 default: 469 return 0; 470 } 471 } 472 473 /* Maglev Hashing scheduling */ 474 static struct ip_vs_dest * 475 ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, 476 struct ip_vs_iphdr *iph) 477 { 478 struct ip_vs_dest *dest; 479 struct ip_vs_mh_state *s; 480 __be16 port = 0; 481 const union nf_inet_addr *hash_addr; 482 483 hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr; 484 485 IP_VS_DBG(6, "%s : Scheduling...\n", __func__); 486 487 if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT) 488 port = ip_vs_mh_get_port(skb, iph); 489 490 s = (struct ip_vs_mh_state *)svc->sched_data; 491 492 if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) 493 dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port); 494 else 495 dest = ip_vs_mh_get(svc, s, hash_addr, port); 496 497 if (!dest) { 498 ip_vs_scheduler_err(svc, "no destination available"); 499 return NULL; 500 } 501 502 IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n", 503 IP_VS_DBG_ADDR(svc->af, hash_addr), 504 ntohs(port), 505 IP_VS_DBG_ADDR(dest->af, &dest->addr), 506 ntohs(dest->port)); 507 508 return dest; 509 } 510 511 /* IPVS MH Scheduler structure */ 512 static struct ip_vs_scheduler ip_vs_mh_scheduler = { 513 .name = "mh", 514 .refcnt = ATOMIC_INIT(0), 515 .module = THIS_MODULE, 516 .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list), 517 .init_service = ip_vs_mh_init_svc, 518 .done_service = ip_vs_mh_done_svc, 519 .add_dest = ip_vs_mh_dest_changed, 520 .del_dest = ip_vs_mh_dest_changed, 521 .upd_dest = ip_vs_mh_dest_changed, 522 .schedule = ip_vs_mh_schedule, 523 }; 524 525 static int __init ip_vs_mh_init(void) 526 { 527 return register_ip_vs_scheduler(&ip_vs_mh_scheduler); 528 } 529 530 static void __exit ip_vs_mh_cleanup(void) 531 { 532 unregister_ip_vs_scheduler(&ip_vs_mh_scheduler); 533 rcu_barrier(); 534 } 535 536 module_init(ip_vs_mh_init); 537 module_exit(ip_vs_mh_cleanup); 538 MODULE_DESCRIPTION("Maglev hashing ipvs scheduler"); 539 MODULE_LICENSE("GPL v2"); 540 MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>"); 541