1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS: Weighted Round-Robin Scheduling module 4 * 5 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 6 * 7 * Changes: 8 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest 9 * Wensong Zhang : changed some comestics things for debugging 10 * Wensong Zhang : changed for the d-linked destination list 11 * Wensong Zhang : added the ip_vs_wrr_update_svc 12 * Julian Anastasov : fixed the bug of returning destination 13 * with weight 0 when all weights are zero 14 */ 15 16 #define KMSG_COMPONENT "IPVS" 17 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 18 19 #include <linux/module.h> 20 #include <linux/kernel.h> 21 #include <linux/slab.h> 22 #include <linux/net.h> 23 #include <linux/gcd.h> 24 25 #include <net/ip_vs.h> 26 27 /* The WRR algorithm depends on some caclulations: 28 * - mw: maximum weight 29 * - di: weight step, greatest common divisor from all weights 30 * - cw: current required weight 31 * As result, all weights are in the [di..mw] range with a step=di. 32 * 33 * First, we start with cw = mw and select dests with weight >= cw. 34 * Then cw is reduced with di and all dests are checked again. 35 * Last pass should be with cw = di. We have mw/di passes in total: 36 * 37 * pass 1: cw = max weight 38 * pass 2: cw = max weight - di 39 * pass 3: cw = max weight - 2 * di 40 * ... 41 * last pass: cw = di 42 * 43 * Weights are supposed to be >= di but we run in parallel with 44 * weight changes, it is possible some dest weight to be reduced 45 * below di, bad if it is the only available dest. 46 * 47 * So, we modify how mw is calculated, now it is reduced with (di - 1), 48 * so that last cw is 1 to catch such dests with weight below di: 49 * pass 1: cw = max weight - (di - 1) 50 * pass 2: cw = max weight - di - (di - 1) 51 * pass 3: cw = max weight - 2 * di - (di - 1) 52 * ... 53 * last pass: cw = 1 54 * 55 */ 56 57 /* 58 * current destination pointer for weighted round-robin scheduling 59 */ 60 struct ip_vs_wrr_mark { 61 struct ip_vs_dest *cl; /* current dest or head */ 62 int cw; /* current weight */ 63 int mw; /* maximum weight */ 64 int di; /* decreasing interval */ 65 struct rcu_head rcu_head; 66 }; 67 68 69 static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) 70 { 71 struct ip_vs_dest *dest; 72 int weight; 73 int g = 0; 74 75 list_for_each_entry(dest, &svc->destinations, n_list) { 76 weight = atomic_read(&dest->weight); 77 if (weight > 0) { 78 if (g > 0) 79 g = gcd(weight, g); 80 else 81 g = weight; 82 } 83 } 84 return g ? g : 1; 85 } 86 87 88 /* 89 * Get the maximum weight of the service destinations. 90 */ 91 static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) 92 { 93 struct ip_vs_dest *dest; 94 int new_weight, weight = 0; 95 96 list_for_each_entry(dest, &svc->destinations, n_list) { 97 new_weight = atomic_read(&dest->weight); 98 if (new_weight > weight) 99 weight = new_weight; 100 } 101 102 return weight; 103 } 104 105 106 static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) 107 { 108 struct ip_vs_wrr_mark *mark; 109 110 /* 111 * Allocate the mark variable for WRR scheduling 112 */ 113 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); 114 if (mark == NULL) 115 return -ENOMEM; 116 117 mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); 118 mark->di = ip_vs_wrr_gcd_weight(svc); 119 mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); 120 mark->cw = mark->mw; 121 svc->sched_data = mark; 122 123 return 0; 124 } 125 126 127 static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) 128 { 129 struct ip_vs_wrr_mark *mark = svc->sched_data; 130 131 /* 132 * Release the mark variable 133 */ 134 kfree_rcu(mark, rcu_head); 135 } 136 137 138 static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, 139 struct ip_vs_dest *dest) 140 { 141 struct ip_vs_wrr_mark *mark = svc->sched_data; 142 143 spin_lock_bh(&svc->sched_lock); 144 mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); 145 mark->di = ip_vs_wrr_gcd_weight(svc); 146 mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); 147 if (mark->cw > mark->mw || !mark->cw) 148 mark->cw = mark->mw; 149 else if (mark->di > 1) 150 mark->cw = (mark->cw / mark->di) * mark->di + 1; 151 spin_unlock_bh(&svc->sched_lock); 152 return 0; 153 } 154 155 156 /* 157 * Weighted Round-Robin Scheduling 158 */ 159 static struct ip_vs_dest * 160 ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, 161 struct ip_vs_iphdr *iph) 162 { 163 struct ip_vs_dest *dest, *last, *stop = NULL; 164 struct ip_vs_wrr_mark *mark = svc->sched_data; 165 bool last_pass = false, restarted = false; 166 167 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); 168 169 spin_lock_bh(&svc->sched_lock); 170 dest = mark->cl; 171 /* No available dests? */ 172 if (mark->mw == 0) 173 goto err_noavail; 174 last = dest; 175 /* Stop only after all dests were checked for weight >= 1 (last pass) */ 176 while (1) { 177 list_for_each_entry_continue_rcu(dest, 178 &svc->destinations, 179 n_list) { 180 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && 181 atomic_read(&dest->weight) >= mark->cw) 182 goto found; 183 if (dest == stop) 184 goto err_over; 185 } 186 mark->cw -= mark->di; 187 if (mark->cw <= 0) { 188 mark->cw = mark->mw; 189 /* Stop if we tried last pass from first dest: 190 * 1. last_pass: we started checks when cw > di but 191 * then all dests were checked for w >= 1 192 * 2. last was head: the first and only traversal 193 * was for weight >= 1, for all dests. 194 */ 195 if (last_pass || 196 &last->n_list == &svc->destinations) 197 goto err_over; 198 restarted = true; 199 } 200 last_pass = mark->cw <= mark->di; 201 if (last_pass && restarted && 202 &last->n_list != &svc->destinations) { 203 /* First traversal was for w >= 1 but only 204 * for dests after 'last', now do the same 205 * for all dests up to 'last'. 206 */ 207 stop = last; 208 } 209 } 210 211 found: 212 IP_VS_DBG_BUF(6, "WRR: server %s:%u " 213 "activeconns %d refcnt %d weight %d\n", 214 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), 215 atomic_read(&dest->activeconns), 216 refcount_read(&dest->refcnt), 217 atomic_read(&dest->weight)); 218 mark->cl = dest; 219 220 out: 221 spin_unlock_bh(&svc->sched_lock); 222 return dest; 223 224 err_noavail: 225 mark->cl = dest; 226 dest = NULL; 227 ip_vs_scheduler_err(svc, "no destination available"); 228 goto out; 229 230 err_over: 231 mark->cl = dest; 232 dest = NULL; 233 ip_vs_scheduler_err(svc, "no destination available: " 234 "all destinations are overloaded"); 235 goto out; 236 } 237 238 239 static struct ip_vs_scheduler ip_vs_wrr_scheduler = { 240 .name = "wrr", 241 .refcnt = ATOMIC_INIT(0), 242 .module = THIS_MODULE, 243 .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), 244 .init_service = ip_vs_wrr_init_svc, 245 .done_service = ip_vs_wrr_done_svc, 246 .add_dest = ip_vs_wrr_dest_changed, 247 .del_dest = ip_vs_wrr_dest_changed, 248 .upd_dest = ip_vs_wrr_dest_changed, 249 .schedule = ip_vs_wrr_schedule, 250 }; 251 252 static int __init ip_vs_wrr_init(void) 253 { 254 return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; 255 } 256 257 static void __exit ip_vs_wrr_cleanup(void) 258 { 259 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); 260 synchronize_rcu(); 261 } 262 263 module_init(ip_vs_wrr_init); 264 module_exit(ip_vs_wrr_cleanup); 265 MODULE_LICENSE("GPL"); 266