12874c5fdSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-or-later
2cb7f6a7bSJulius Volz /*
3cb7f6a7bSJulius Volz * IPVS: Shortest Expected Delay scheduling module
4cb7f6a7bSJulius Volz *
5cb7f6a7bSJulius Volz * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
6cb7f6a7bSJulius Volz *
7cb7f6a7bSJulius Volz * Changes:
8cb7f6a7bSJulius Volz */
9cb7f6a7bSJulius Volz
10cb7f6a7bSJulius Volz /*
11cb7f6a7bSJulius Volz * The SED algorithm attempts to minimize each job's expected delay until
12cb7f6a7bSJulius Volz * completion. The expected delay that the job will experience is
13cb7f6a7bSJulius Volz * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
14cb7f6a7bSJulius Volz * jobs on the ith server and Ui is the fixed service rate (weight) of
15cb7f6a7bSJulius Volz * the ith server. The SED algorithm adopts a greedy policy that each does
16cb7f6a7bSJulius Volz * what is in its own best interest, i.e. to join the queue which would
17cb7f6a7bSJulius Volz * minimize its expected delay of completion.
18cb7f6a7bSJulius Volz *
19cb7f6a7bSJulius Volz * See the following paper for more information:
20cb7f6a7bSJulius Volz * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
21cb7f6a7bSJulius Volz * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
22cb7f6a7bSJulius Volz * pages 986-994, 1988.
23cb7f6a7bSJulius Volz *
24cb7f6a7bSJulius Volz * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
25cb7f6a7bSJulius Volz *
26cb7f6a7bSJulius Volz * The difference between SED and WLC is that SED includes the incoming
27cb7f6a7bSJulius Volz * job in the cost function (the increment of 1). SED may outperform
28cb7f6a7bSJulius Volz * WLC, while scheduling big jobs under larger heterogeneous systems
29cb7f6a7bSJulius Volz * (the server weight varies a lot).
30cb7f6a7bSJulius Volz *
31cb7f6a7bSJulius Volz */
32cb7f6a7bSJulius Volz
339aada7acSHannes Eder #define KMSG_COMPONENT "IPVS"
349aada7acSHannes Eder #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
359aada7acSHannes Eder
36cb7f6a7bSJulius Volz #include <linux/module.h>
37cb7f6a7bSJulius Volz #include <linux/kernel.h>
38cb7f6a7bSJulius Volz
39cb7f6a7bSJulius Volz #include <net/ip_vs.h>
40cb7f6a7bSJulius Volz
41cb7f6a7bSJulius Volz
42c16526a7SSimon Kirby static inline int
ip_vs_sed_dest_overhead(struct ip_vs_dest * dest)43cb7f6a7bSJulius Volz ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
44cb7f6a7bSJulius Volz {
45cb7f6a7bSJulius Volz /*
46cb7f6a7bSJulius Volz * We only use the active connection number in the cost
47cb7f6a7bSJulius Volz * calculation here.
48cb7f6a7bSJulius Volz */
49cb7f6a7bSJulius Volz return atomic_read(&dest->activeconns) + 1;
50cb7f6a7bSJulius Volz }
51cb7f6a7bSJulius Volz
52cb7f6a7bSJulius Volz
53cb7f6a7bSJulius Volz /*
54cb7f6a7bSJulius Volz * Weighted Least Connection scheduling
55cb7f6a7bSJulius Volz */
56cb7f6a7bSJulius Volz static struct ip_vs_dest *
ip_vs_sed_schedule(struct ip_vs_service * svc,const struct sk_buff * skb,struct ip_vs_iphdr * iph)57bba54de5SJulian Anastasov ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
58bba54de5SJulian Anastasov struct ip_vs_iphdr *iph)
59cb7f6a7bSJulius Volz {
60cb7f6a7bSJulius Volz struct ip_vs_dest *dest, *least;
61c16526a7SSimon Kirby int loh, doh;
62cb7f6a7bSJulius Volz
631e3e238eSHannes Eder IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
64cb7f6a7bSJulius Volz
65cb7f6a7bSJulius Volz /*
66cb7f6a7bSJulius Volz * We calculate the load of each dest server as follows:
67cb7f6a7bSJulius Volz * (server expected overhead) / dest->weight
68cb7f6a7bSJulius Volz *
69cb7f6a7bSJulius Volz * Remember -- no floats in kernel mode!!!
70cb7f6a7bSJulius Volz * The comparison of h1*w2 > h2*w1 is equivalent to that of
71cb7f6a7bSJulius Volz * h1/w1 > h2/w2
72cb7f6a7bSJulius Volz * if every weight is larger than zero.
73cb7f6a7bSJulius Volz *
74cb7f6a7bSJulius Volz * The server with weight=0 is quiesced and will not receive any
75cb7f6a7bSJulius Volz * new connections.
76cb7f6a7bSJulius Volz */
77cb7f6a7bSJulius Volz
789be52abaSJulian Anastasov list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
79cb7f6a7bSJulius Volz if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
80cb7f6a7bSJulius Volz atomic_read(&dest->weight) > 0) {
81cb7f6a7bSJulius Volz least = dest;
82cb7f6a7bSJulius Volz loh = ip_vs_sed_dest_overhead(least);
83cb7f6a7bSJulius Volz goto nextstage;
84cb7f6a7bSJulius Volz }
85cb7f6a7bSJulius Volz }
8641ac51eeSPatrick Schaaf ip_vs_scheduler_err(svc, "no destination available");
87cb7f6a7bSJulius Volz return NULL;
88cb7f6a7bSJulius Volz
89cb7f6a7bSJulius Volz /*
90cb7f6a7bSJulius Volz * Find the destination with the least load.
91cb7f6a7bSJulius Volz */
92cb7f6a7bSJulius Volz nextstage:
939be52abaSJulian Anastasov list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
94cb7f6a7bSJulius Volz if (dest->flags & IP_VS_DEST_F_OVERLOAD)
95cb7f6a7bSJulius Volz continue;
96cb7f6a7bSJulius Volz doh = ip_vs_sed_dest_overhead(dest);
97c16526a7SSimon Kirby if ((__s64)loh * atomic_read(&dest->weight) >
98c16526a7SSimon Kirby (__s64)doh * atomic_read(&least->weight)) {
99cb7f6a7bSJulius Volz least = dest;
100cb7f6a7bSJulius Volz loh = doh;
101cb7f6a7bSJulius Volz }
102cb7f6a7bSJulius Volz }
103cb7f6a7bSJulius Volz
104cb7f6a7bSJulius Volz IP_VS_DBG_BUF(6, "SED: server %s:%u "
105cb7f6a7bSJulius Volz "activeconns %d refcnt %d weight %d overhead %d\n",
1064d316f3fSJulian Anastasov IP_VS_DBG_ADDR(least->af, &least->addr),
1074d316f3fSJulian Anastasov ntohs(least->port),
108cb7f6a7bSJulius Volz atomic_read(&least->activeconns),
109b54ab92bSReshetova, Elena refcount_read(&least->refcnt),
110cb7f6a7bSJulius Volz atomic_read(&least->weight), loh);
111cb7f6a7bSJulius Volz
112cb7f6a7bSJulius Volz return least;
113cb7f6a7bSJulius Volz }
114cb7f6a7bSJulius Volz
115cb7f6a7bSJulius Volz
116cb7f6a7bSJulius Volz static struct ip_vs_scheduler ip_vs_sed_scheduler =
117cb7f6a7bSJulius Volz {
118cb7f6a7bSJulius Volz .name = "sed",
119cb7f6a7bSJulius Volz .refcnt = ATOMIC_INIT(0),
120cb7f6a7bSJulius Volz .module = THIS_MODULE,
121cb7f6a7bSJulius Volz .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
122cb7f6a7bSJulius Volz .schedule = ip_vs_sed_schedule,
123cb7f6a7bSJulius Volz };
124cb7f6a7bSJulius Volz
125cb7f6a7bSJulius Volz
ip_vs_sed_init(void)126cb7f6a7bSJulius Volz static int __init ip_vs_sed_init(void)
127cb7f6a7bSJulius Volz {
128cb7f6a7bSJulius Volz return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
129cb7f6a7bSJulius Volz }
130cb7f6a7bSJulius Volz
ip_vs_sed_cleanup(void)131cb7f6a7bSJulius Volz static void __exit ip_vs_sed_cleanup(void)
132cb7f6a7bSJulius Volz {
133cb7f6a7bSJulius Volz unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
134ceec4c38SJulian Anastasov synchronize_rcu();
135cb7f6a7bSJulius Volz }
136cb7f6a7bSJulius Volz
137cb7f6a7bSJulius Volz module_init(ip_vs_sed_init);
138cb7f6a7bSJulius Volz module_exit(ip_vs_sed_cleanup);
139cb7f6a7bSJulius Volz MODULE_LICENSE("GPL");
140