xref: /openbmc/linux/net/netfilter/ipvs/ip_vs_est.c (revision e4d0fe71)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * ip_vs_est.c: simple rate estimator for IPVS
4  *
5  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
6  *
7  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
8  *              Network name space (netns) aware.
9  *              Global data moved to netns, i.e. struct netns_ipvs.
 10  *              Affected data: est_list and est_lock.
 11  *              estimation_timer() runs with a timer per netns.
 12  *              get_stats() does the per-CPU summing.
13  */
14 
15 #define KMSG_COMPONENT "IPVS"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17 
18 #include <linux/kernel.h>
19 #include <linux/jiffies.h>
20 #include <linux/types.h>
21 #include <linux/interrupt.h>
22 #include <linux/sysctl.h>
23 #include <linux/list.h>
24 
25 #include <net/ip_vs.h>
26 
27 /*
28   This code estimates the rate over a short interval (such as 8
 29   seconds) for virtual services and real servers. To measure the rate over
 30   a long interval, it is easy to implement a user-level daemon that
 31   periodically reads these statistical counters and computes the rate.
32 
33   We measure the rate over the last 8 seconds, every 2 seconds:
34 
35     avgrate = avgrate*(1-W) + rate*W
36 
37     where W = 2^(-2)
38 
39   NOTES.
40 
41   * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10 (see the worked example after this comment block).
42 
43   * Netlink users can see 64-bit values but sockopt users are restricted
44     to 32-bit values for conns, packets, bps, cps and pps.
45 
46   * A lot of code is taken from net/core/gen_estimator.c
47 
48   KEY POINTS:
49   - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
50   - kthreads read the cpustats to update the estimators (svcs, dests, total)
51   - the state of the estimators can be read (get stats) or modified
 52     (zero stats) from process context
53 
54   KTHREADS:
55   - estimators are added initially to est_temp_list and later kthread 0
56     distributes them to one or many kthreads for estimation
57   - kthread contexts are created and attached to an array
 58   - the kthread tasks are started when the first service is added; before
 59     that, the total stats are not estimated
60   - when configuration (cpulist/nice) is changed, the tasks are restarted
61     by work (est_reload_work)
62   - kthread tasks are stopped while the cpulist is empty
63   - the kthread context holds lists with estimators (chains) which are
64     processed every 2 seconds
65   - as estimators can be added dynamically and in bursts, we try to spread
 66     them over multiple chains which are estimated at different times
67   - on start, kthread 0 enters calculation phase to determine the chain limits
68     and the limit of estimators per kthread
69   - est_add_ktid: ktid where new ests are added; it can point to an empty
 70     slot where kt data should be allocated
71  */
72 
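/* A worked example of the formula and scaling described above (for
 * illustration only): the estimators keep cps/pps scaled by 2^10 and bps
 * scaled by 2^5, and apply the average with W = 2^(-2) as
 *
 *	avg += (rate - avg) >> 2;
 *
 * Suppose 600 new connections are counted during one 2-second pass:
 *
 *	rate = 600 << 9 = 307200	(300 conns/s scaled by 2^10; the
 *					 shift by 9 = 10 - 1 folds in the
 *					 division by the 2-second interval)
 *
 * With a previous average of 204800 (200 cps) the new average becomes
 *
 *	204800 + ((307200 - 204800) >> 2) = 230400
 *
 * which ip_vs_read_estimator() decodes as (230400 + 0x1FF) >> 10 = 225
 * conns/s.
 */
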
73 static struct lock_class_key __ipvs_est_key;
74 
75 static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
76 static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
77 
78 static void ip_vs_chain_estimation(struct hlist_head *chain)
79 {
80 	struct ip_vs_estimator *e;
81 	struct ip_vs_cpu_stats *c;
82 	struct ip_vs_stats *s;
83 	u64 rate;
84 
85 	hlist_for_each_entry_rcu(e, chain, list) {
86 		u64 conns, inpkts, outpkts, inbytes, outbytes;
87 		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
88 		u64 kinbytes = 0, koutbytes = 0;
89 		unsigned int start;
90 		int i;
91 
92 		if (kthread_should_stop())
93 			break;
94 
95 		s = container_of(e, struct ip_vs_stats, est);
96 		for_each_possible_cpu(i) {
97 			c = per_cpu_ptr(s->cpustats, i);
98 			do {
99 				start = u64_stats_fetch_begin(&c->syncp);
100 				conns = u64_stats_read(&c->cnt.conns);
101 				inpkts = u64_stats_read(&c->cnt.inpkts);
102 				outpkts = u64_stats_read(&c->cnt.outpkts);
103 				inbytes = u64_stats_read(&c->cnt.inbytes);
104 				outbytes = u64_stats_read(&c->cnt.outbytes);
105 			} while (u64_stats_fetch_retry(&c->syncp, start));
106 			kconns += conns;
107 			kinpkts += inpkts;
108 			koutpkts += outpkts;
109 			kinbytes += inbytes;
110 			koutbytes += outbytes;
111 		}
112 
113 		spin_lock(&s->lock);
114 
115 		s->kstats.conns = kconns;
116 		s->kstats.inpkts = kinpkts;
117 		s->kstats.outpkts = koutpkts;
118 		s->kstats.inbytes = kinbytes;
119 		s->kstats.outbytes = koutbytes;
120 
121 		/* scaled by 2^10, but divided by 2 seconds */
122 		rate = (s->kstats.conns - e->last_conns) << 9;
123 		e->last_conns = s->kstats.conns;
124 		e->cps += ((s64)rate - (s64)e->cps) >> 2;
125 
126 		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
127 		e->last_inpkts = s->kstats.inpkts;
128 		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;
129 
130 		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
131 		e->last_outpkts = s->kstats.outpkts;
132 		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;
133 
134 		/* scaled by 2^5, but divided by 2 seconds */
135 		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
136 		e->last_inbytes = s->kstats.inbytes;
137 		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;
138 
139 		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
140 		e->last_outbytes = s->kstats.outbytes;
141 		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
142 		spin_unlock(&s->lock);
143 	}
144 }
145 
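/* The read loop above follows the usual u64_stats seqcount pattern: the
 * per-CPU writers (SoftIRQ context, BH disabled) bump a sequence count
 * around their updates and readers retry if it changed, so the 64-bit
 * counters are never observed torn on 32-bit machines.  A minimal
 * user-space model of the reader side, only to illustrate the idea (the
 * names and types here are made up, this is not the kernel API):
 *
 *	struct cpu_cnt { unsigned int seq; uint64_t conns; };
 *
 *	static uint64_t read_conns(const struct cpu_cnt *c)
 *	{
 *		unsigned int start;
 *		uint64_t v;
 *
 *		do {
 *			start = __atomic_load_n(&c->seq, __ATOMIC_ACQUIRE);
 *			v = c->conns;		// may race with a writer
 *		} while ((start & 1) ||		// writer was in progress
 *			 __atomic_load_n(&c->seq, __ATOMIC_ACQUIRE) != start);
 *		return v;
 *	}
 */
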
146 static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
147 {
148 	struct ip_vs_est_tick_data *td;
149 	int cid;
150 
151 	rcu_read_lock();
152 	td = rcu_dereference(kd->ticks[row]);
153 	if (!td)
154 		goto out;
155 	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
156 		if (kthread_should_stop())
157 			break;
158 		ip_vs_chain_estimation(&td->chains[cid]);
159 		cond_resched_rcu();
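		/* the tick data can be freed (and kd->ticks[row] cleared) by
		 * ip_vs_stop_estimator() while cond_resched_rcu() briefly
		 * dropped the RCU read lock, so re-fetch and re-check it
		 */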
160 		td = rcu_dereference(kd->ticks[row]);
161 		if (!td)
162 			break;
163 	}
164 
165 out:
166 	rcu_read_unlock();
167 }
168 
169 static int ip_vs_estimation_kthread(void *data)
170 {
171 	struct ip_vs_est_kt_data *kd = data;
172 	struct netns_ipvs *ipvs = kd->ipvs;
173 	int row = kd->est_row;
174 	unsigned long now;
175 	int id = kd->id;
176 	long gap;
177 
178 	if (id > 0) {
179 		if (!ipvs->est_chain_max)
180 			return 0;
181 	} else {
182 		if (!ipvs->est_chain_max) {
183 			ipvs->est_calc_phase = 1;
184 			/* commit est_calc_phase before reading est_genid */
185 			smp_mb();
186 		}
187 
188 		/* kthread 0 will handle the calc phase */
189 		if (ipvs->est_calc_phase)
190 			ip_vs_est_calc_phase(ipvs);
191 	}
192 
193 	while (1) {
194 		if (!id && !hlist_empty(&ipvs->est_temp_list))
195 			ip_vs_est_drain_temp_list(ipvs);
196 		set_current_state(TASK_IDLE);
197 		if (kthread_should_stop())
198 			break;
199 
200 		/* before estimation, check if we should sleep */
201 		now = jiffies;
202 		gap = kd->est_timer - now;
203 		if (gap > 0) {
204 			if (gap > IPVS_EST_TICK) {
205 				kd->est_timer = now - IPVS_EST_TICK;
206 				gap = IPVS_EST_TICK;
207 			}
208 			schedule_timeout(gap);
209 		} else {
210 			__set_current_state(TASK_RUNNING);
211 			if (gap < -8 * IPVS_EST_TICK)
212 				kd->est_timer = now;
213 		}
214 
215 		if (kd->tick_len[row])
216 			ip_vs_tick_estimation(kd, row);
217 
218 		row++;
219 		if (row >= IPVS_EST_NTICKS)
220 			row = 0;
221 		WRITE_ONCE(kd->est_row, row);
222 		kd->est_timer += IPVS_EST_TICK;
223 	}
224 	__set_current_state(TASK_RUNNING);
225 
226 	return 0;
227 }
228 
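/* The loop above runs at a fixed cadence: sleep in TASK_IDLE until
 * kd->est_timer, process one of the IPVS_EST_NTICKS tick rows, then advance
 * est_timer by IPVS_EST_TICK, resynchronizing to jiffies when it has fallen
 * far behind.  A stripped-down user-space model of that cadence (a sketch
 * only; process_row() stands in for ip_vs_tick_estimation() and NROWS for
 * IPVS_EST_NTICKS, assumed to be 50 here, so one full cycle covers the
 * 2-second estimation period):
 *
 *	enum { NROWS = 50, TICK_NS = 2 * 1000000000 / NROWS };
 *
 *	for (int row = 0;; row = (row + 1) % NROWS) {
 *		struct timespec ts = { 0, TICK_NS };
 *
 *		nanosleep(&ts, NULL);	// kernel: schedule_timeout(gap)
 *		process_row(row);	// kernel: ip_vs_tick_estimation()
 *	}
 */
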
229 /* Schedule stop/start for kthread tasks */
230 void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
231 {
232 	/* Ignore reloads before first service is added */
233 	if (!ipvs->enable)
234 		return;
235 	ip_vs_est_stopped_recalc(ipvs);
236 	/* Bump the kthread configuration genid */
237 	atomic_inc(&ipvs->est_genid);
238 	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
239 }
240 
241 /* Start kthread task with current configuration */
242 int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
243 			    struct ip_vs_est_kt_data *kd)
244 {
245 	unsigned long now;
246 	int ret = 0;
247 	long gap;
248 
249 	lockdep_assert_held(&ipvs->est_mutex);
250 
251 	if (kd->task)
252 		goto out;
253 	now = jiffies;
254 	gap = kd->est_timer - now;
255 	/* Sync est_timer if task is starting later */
256 	if (abs(gap) > 4 * IPVS_EST_TICK)
257 		kd->est_timer = now;
258 	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
259 				  ipvs->gen, kd->id);
260 	if (IS_ERR(kd->task)) {
261 		ret = PTR_ERR(kd->task);
262 		kd->task = NULL;
263 		goto out;
264 	}
265 
266 	set_user_nice(kd->task, sysctl_est_nice(ipvs));
267 	set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
268 
269 	pr_info("starting estimator thread %d...\n", kd->id);
270 	wake_up_process(kd->task);
271 
272 out:
273 	return ret;
274 }
275 
276 void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
277 {
278 	if (kd->task) {
279 		pr_info("stopping estimator thread %d...\n", kd->id);
280 		kthread_stop(kd->task);
281 		kd->task = NULL;
282 	}
283 }
284 
285 /* Apply parameters to kthread */
286 static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
287 				 struct ip_vs_est_kt_data *kd)
288 {
289 	kd->chain_max = ipvs->est_chain_max;
290 	/* We are using a single chain on RCU preemption */
291 	if (IPVS_EST_TICK_CHAINS == 1)
292 		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
293 	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
294 	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
295 }
296 
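/* For illustration, assuming the calc phase produced est_chain_max = 38 and
 * that ip_vs.h defines IPVS_EST_TICK_CHAINS = 48 and IPVS_EST_NTICKS = 50
 * (the constants are assumptions here, see the header for the real values),
 * one kthread would accept at most:
 *
 *	tick_max      = 48 * 38   =  1824 estimators per tick
 *	est_max_count = 50 * 1824 = 91200 estimators per kthread
 *
 * With IPVS_EST_TICK_CHAINS == 1 the single chain is instead allowed to be
 * IPVS_EST_CHAIN_FACTOR times longer.
 */
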
297 /* Create and start estimation kthread in a free or new array slot */
298 static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
299 {
300 	struct ip_vs_est_kt_data *kd = NULL;
301 	int id = ipvs->est_kt_count;
302 	int ret = -ENOMEM;
303 	void *arr = NULL;
304 	int i;
305 
306 	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
307 	    ipvs->enable && ipvs->est_max_threads)
308 		return -EINVAL;
309 
310 	mutex_lock(&ipvs->est_mutex);
311 
312 	for (i = 0; i < id; i++) {
313 		if (!ipvs->est_kt_arr[i])
314 			break;
315 	}
316 	if (i >= id) {
317 		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
318 				     sizeof(struct ip_vs_est_kt_data *),
319 				     GFP_KERNEL);
320 		if (!arr)
321 			goto out;
322 		ipvs->est_kt_arr = arr;
323 	} else {
324 		id = i;
325 	}
326 
327 	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
328 	if (!kd)
329 		goto out;
330 	kd->ipvs = ipvs;
331 	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
332 	kd->est_timer = jiffies;
333 	kd->id = id;
334 	ip_vs_est_set_params(ipvs, kd);
335 
336 	/* Pre-allocate stats used in calc phase */
337 	if (!id && !kd->calc_stats) {
338 		kd->calc_stats = ip_vs_stats_alloc();
339 		if (!kd->calc_stats)
340 			goto out;
341 	}
342 
343 	/* Start kthread tasks only when services are present */
344 	if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
345 		ret = ip_vs_est_kthread_start(ipvs, kd);
346 		if (ret < 0)
347 			goto out;
348 	}
349 
350 	if (arr)
351 		ipvs->est_kt_count++;
352 	ipvs->est_kt_arr[id] = kd;
353 	kd = NULL;
354 	/* Use most recent kthread for new ests */
355 	ipvs->est_add_ktid = id;
356 	ret = 0;
357 
358 out:
359 	mutex_unlock(&ipvs->est_mutex);
360 	if (kd) {
361 		ip_vs_stats_free(kd->calc_stats);
362 		kfree(kd);
363 	}
364 
365 	return ret;
366 }
367 
368 /* Select ktid where to add new ests: available, unused or new slot */
369 static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
370 {
371 	int ktid, best = ipvs->est_kt_count;
372 	struct ip_vs_est_kt_data *kd;
373 
374 	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
375 		kd = ipvs->est_kt_arr[ktid];
376 		if (kd) {
377 			if (kd->est_count < kd->est_max_count) {
378 				best = ktid;
379 				break;
380 			}
381 		} else if (ktid < best) {
382 			best = ktid;
383 		}
384 	}
385 	ipvs->est_add_ktid = best;
386 }
387 
388 /* Add estimator to current kthread (est_add_ktid) */
389 static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
390 				   struct ip_vs_estimator *est)
391 {
392 	struct ip_vs_est_kt_data *kd = NULL;
393 	struct ip_vs_est_tick_data *td;
394 	int ktid, row, crow, cid, ret;
395 	int delay = est->ktrow;
396 
397 	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
398 			 "Too many chains for ktcid");
399 
400 	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
401 		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
402 		if (kd)
403 			goto add_est;
404 	}
405 
406 	ret = ip_vs_est_add_kthread(ipvs);
407 	if (ret < 0)
408 		goto out;
409 	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
410 
411 add_est:
412 	ktid = kd->id;
413 	/* For a small number of estimators prefer to use few ticks,
414 	 * otherwise try to add into the last estimated row.
415 	 * est_row and add_row point after the row we should use.
416 	 */
417 	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
418 		crow = READ_ONCE(kd->est_row);
419 	else
420 		crow = kd->add_row;
421 	crow += delay;
422 	if (crow >= IPVS_EST_NTICKS)
423 		crow -= IPVS_EST_NTICKS;
424 	/* Initial (full) delay requested? */
425 	if (delay >= IPVS_EST_NTICKS - 1) {
426 		/* Preserve initial delay or decrease it if no space in tick */
427 		row = crow;
428 		if (crow < IPVS_EST_NTICKS - 1) {
429 			crow++;
430 			row = find_last_bit(kd->avail, crow);
431 		}
432 		if (row >= crow)
433 			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
434 	} else {
435 		/* Preserve delay or increase it if no space in tick */
436 		row = IPVS_EST_NTICKS;
437 		if (crow > 0)
438 			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
439 		if (row >= IPVS_EST_NTICKS)
440 			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
441 	}
442 
443 	td = rcu_dereference_protected(kd->ticks[row], 1);
444 	if (!td) {
445 		td = kzalloc(sizeof(*td), GFP_KERNEL);
446 		if (!td) {
447 			ret = -ENOMEM;
448 			goto out;
449 		}
450 		rcu_assign_pointer(kd->ticks[row], td);
451 	}
452 
453 	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);
454 
455 	kd->est_count++;
456 	kd->tick_len[row]++;
457 	if (!td->chain_len[cid])
458 		__set_bit(cid, td->present);
459 	td->chain_len[cid]++;
460 	est->ktid = ktid;
461 	est->ktrow = row;
462 	est->ktcid = cid;
463 	hlist_add_head_rcu(&est->list, &td->chains[cid]);
464 
465 	if (td->chain_len[cid] >= kd->chain_max) {
466 		__set_bit(cid, td->full);
467 		if (kd->tick_len[row] >= kd->tick_max)
468 			__clear_bit(row, kd->avail);
469 	}
470 
471 	/* Update est_add_ktid to point to first available/empty kt slot */
472 	if (kd->est_count == kd->est_max_count)
473 		ip_vs_est_update_ktid(ipvs);
474 
475 	ret = 0;
476 
477 out:
478 	return ret;
479 }
480 
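/* Example for the row selection above (the numbers are illustrative only,
 * assuming IPVS_EST_NTICKS = 50): a new estimator arrives with the full
 * initial delay est->ktrow = 49 while the base row is 10, so
 *
 *	crow = 10 + 49 - 50 = 9
 *
 * Because the delay is the initial one, the highest set bit of kd->avail at
 * or below crow is chosen (falling back to the highest available row
 * overall), which keeps or shrinks the delay.  For smaller delays the search
 * goes forward from crow instead, keeping or growing the delay.  Either way
 * a burst of new estimators is spread over rows that are estimated at
 * different times within the 2-second period.
 */
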
481 /* Start estimation for stats */
482 int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
483 {
484 	struct ip_vs_estimator *est = &stats->est;
485 	int ret;
486 
487 	if (!ipvs->est_max_threads && ipvs->enable)
488 		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
489 
490 	est->ktid = -1;
491 	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */
492 
493 	/* We prefer this code to be short: kthread 0 will requeue the
494 	 * estimator to an available chain. If tasks are disabled, we
495 	 * will not allocate much memory, just for kt 0.
496 	 */
497 	ret = 0;
498 	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
499 		ret = ip_vs_est_add_kthread(ipvs);
500 	if (ret >= 0)
501 		hlist_add_head(&est->list, &ipvs->est_temp_list);
502 	else
503 		INIT_HLIST_NODE(&est->list);
504 	return ret;
505 }
506 
507 static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
508 {
509 	if (kd) {
510 		if (kd->task) {
511 			pr_info("stop unused estimator thread %d...\n", kd->id);
512 			kthread_stop(kd->task);
513 		}
514 		ip_vs_stats_free(kd->calc_stats);
515 		kfree(kd);
516 	}
517 }
518 
519 /* Unlink estimator from chain */
520 void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
521 {
522 	struct ip_vs_estimator *est = &stats->est;
523 	struct ip_vs_est_tick_data *td;
524 	struct ip_vs_est_kt_data *kd;
525 	int ktid = est->ktid;
526 	int row = est->ktrow;
527 	int cid = est->ktcid;
528 
529 	/* Failed to add to chain ? */
530 	if (hlist_unhashed(&est->list))
531 		return;
532 
533 	/* On return, estimator can be freed, dequeue it now */
534 
535 	/* In est_temp_list ? */
536 	if (ktid < 0) {
537 		hlist_del(&est->list);
538 		goto end_kt0;
539 	}
540 
541 	hlist_del_rcu(&est->list);
542 	kd = ipvs->est_kt_arr[ktid];
543 	td = rcu_dereference_protected(kd->ticks[row], 1);
544 	__clear_bit(cid, td->full);
545 	td->chain_len[cid]--;
546 	if (!td->chain_len[cid])
547 		__clear_bit(cid, td->present);
548 	kd->tick_len[row]--;
549 	__set_bit(row, kd->avail);
550 	if (!kd->tick_len[row]) {
551 		RCU_INIT_POINTER(kd->ticks[row], NULL);
552 		kfree_rcu(td, rcu_head);
553 	}
554 	kd->est_count--;
555 	if (kd->est_count) {
556 		/* This kt slot can become available just now, prefer it */
557 		if (ktid < ipvs->est_add_ktid)
558 			ipvs->est_add_ktid = ktid;
559 		return;
560 	}
561 
562 	if (ktid > 0) {
563 		mutex_lock(&ipvs->est_mutex);
564 		ip_vs_est_kthread_destroy(kd);
565 		ipvs->est_kt_arr[ktid] = NULL;
566 		if (ktid == ipvs->est_kt_count - 1) {
567 			ipvs->est_kt_count--;
568 			while (ipvs->est_kt_count > 1 &&
569 			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
570 				ipvs->est_kt_count--;
571 		}
572 		mutex_unlock(&ipvs->est_mutex);
573 
574 		/* This slot is now empty, prefer another available kt slot */
575 		if (ktid == ipvs->est_add_ktid)
576 			ip_vs_est_update_ktid(ipvs);
577 	}
578 
579 end_kt0:
580 	/* kt 0 is freed after all other kthreads and chains are empty */
581 	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
582 		kd = ipvs->est_kt_arr[0];
583 		if (!kd || !kd->est_count) {
584 			mutex_lock(&ipvs->est_mutex);
585 			if (kd) {
586 				ip_vs_est_kthread_destroy(kd);
587 				ipvs->est_kt_arr[0] = NULL;
588 			}
589 			ipvs->est_kt_count--;
590 			mutex_unlock(&ipvs->est_mutex);
591 			ipvs->est_add_ktid = 0;
592 		}
593 	}
594 }
595 
596 /* Register all ests from est_temp_list to kthreads */
597 static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
598 {
599 	struct ip_vs_estimator *est;
600 
601 	while (1) {
602 		int max = 16;
603 
604 		mutex_lock(&__ip_vs_mutex);
605 
606 		while (max-- > 0) {
607 			est = hlist_entry_safe(ipvs->est_temp_list.first,
608 					       struct ip_vs_estimator, list);
609 			if (est) {
610 				if (kthread_should_stop())
611 					goto unlock;
612 				hlist_del_init(&est->list);
613 				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
614 					continue;
615 				est->ktid = -1;
616 				hlist_add_head(&est->list,
617 					       &ipvs->est_temp_list);
618 				/* Abort, some entries will not be estimated
619 				 * until next attempt
620 				 */
621 			}
622 			goto unlock;
623 		}
624 		mutex_unlock(&__ip_vs_mutex);
625 		cond_resched();
626 	}
627 
628 unlock:
629 	mutex_unlock(&__ip_vs_mutex);
630 }
631 
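/* The loop above is a bounded batch: at most 16 estimators are moved per
 * acquisition of __ip_vs_mutex, then the mutex is dropped and cond_resched()
 * runs, so configuration changes are not starved while a large burst of
 * estimators is distributed.  The general shape of the pattern (an
 * illustrative sketch, not the code above):
 *
 *	while (have_work()) {
 *		int budget = 16;
 *
 *		mutex_lock(&big_lock);
 *		while (budget-- > 0 && have_work())
 *			move_one_item();
 *		mutex_unlock(&big_lock);
 *		cond_resched();		// let other lock users in
 *	}
 */
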
632 /* Calculate limits for all kthreads */
633 static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
634 {
635 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
636 	struct ip_vs_est_kt_data *kd;
637 	struct hlist_head chain;
638 	struct ip_vs_stats *s;
639 	int cache_factor = 4;
640 	int i, loops, ntest;
641 	s32 min_est = 0;
642 	ktime_t t1, t2;
643 	int max = 8;
644 	int ret = 1;
645 	s64 diff;
646 	u64 val;
647 
648 	INIT_HLIST_HEAD(&chain);
649 	mutex_lock(&__ip_vs_mutex);
650 	kd = ipvs->est_kt_arr[0];
651 	mutex_unlock(&__ip_vs_mutex);
652 	s = kd ? kd->calc_stats : NULL;
653 	if (!s)
654 		goto out;
655 	hlist_add_head(&s->est.list, &chain);
656 
657 	loops = 1;
658 	/* Get best result from many tests */
659 	for (ntest = 0; ntest < 12; ntest++) {
660 		if (!(ntest & 3)) {
661 			/* Wait for cpufreq frequency transition */
662 			wait_event_idle_timeout(wq, kthread_should_stop(),
663 						HZ / 50);
664 			if (!ipvs->enable || kthread_should_stop())
665 				goto stop;
666 		}
667 
668 		local_bh_disable();
669 		rcu_read_lock();
670 
671 		/* Put stats in cache */
672 		ip_vs_chain_estimation(&chain);
673 
674 		t1 = ktime_get();
675 		for (i = loops * cache_factor; i > 0; i--)
676 			ip_vs_chain_estimation(&chain);
677 		t2 = ktime_get();
678 
679 		rcu_read_unlock();
680 		local_bh_enable();
681 
682 		if (!ipvs->enable || kthread_should_stop())
683 			goto stop;
684 		cond_resched();
685 
686 		diff = ktime_to_ns(ktime_sub(t2, t1));
687 		if (diff <= 1 * NSEC_PER_USEC) {
688 			/* Do more loops on low time resolution */
689 			loops *= 2;
690 			continue;
691 		}
692 		if (diff >= NSEC_PER_SEC)
693 			continue;
694 		val = diff;
695 		do_div(val, loops);
696 		if (!min_est || val < min_est) {
697 			min_est = val;
698 			/* goal: 95usec per chain */
699 			val = 95 * NSEC_PER_USEC;
700 			if (val >= min_est) {
701 				do_div(val, min_est);
702 				max = (int)val;
703 			} else {
704 				max = 1;
705 			}
706 		}
707 	}
708 
709 out:
710 	if (s)
711 		hlist_del_init(&s->est.list);
712 	*chain_max = max;
713 	return ret;
714 
715 stop:
716 	ret = 0;
717 	goto out;
718 }
719 
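/* Worked example for ip_vs_est_calc_limits() above (the numbers are
 * illustrative): each timed pass runs the one-estimator chain
 * loops * cache_factor times, so val = diff / loops charges an estimator
 * roughly cache_factor (4) times its hot-cache cost, approximating cold
 * caches in real use.  If the best (lowest) measurement is val = 760 ns,
 * the 95 usec per-chain goal gives
 *
 *	chain_max = 95000 / 760 = 125 estimators per chain
 *
 * so estimating one full chain should cost on the order of 95 usec of CPU
 * time every 2 seconds.
 */
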
720 /* Calculate the parameters and apply them in context of kt #0
721  * ECP: est_calc_phase
722  * ECM: est_chain_max
723  * ECP	ECM	Insert Chain	enable	Description
724  * ---------------------------------------------------------------------------
725  * 0	0	est_temp_list	0	create kt #0 context
726  * 0	0	est_temp_list	0->1	service added, start kthread #0 task
727  * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
728  * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
729  *					stop tasks, move ests to est_temp_list
730  *					and free kd for kthreads 1..last
731  * 1->0	0->N	kt chains	1	ests can go to kthreads
732  * 0	N	kt chains	1	drain est_temp_list, create new kthread
733  *					contexts, start tasks, estimate
734  */
735 static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
736 {
737 	int genid = atomic_read(&ipvs->est_genid);
738 	struct ip_vs_est_tick_data *td;
739 	struct ip_vs_est_kt_data *kd;
740 	struct ip_vs_estimator *est;
741 	struct ip_vs_stats *stats;
742 	int id, row, cid, delay;
743 	bool last, last_td;
744 	int chain_max;
745 	int step;
746 
747 	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
748 		return;
749 
750 	mutex_lock(&__ip_vs_mutex);
751 
752 	/* Stop all other tasks, so that we can immediately move the
753 	 * estimators to est_temp_list without an RCU grace period
754 	 */
755 	mutex_lock(&ipvs->est_mutex);
756 	for (id = 1; id < ipvs->est_kt_count; id++) {
757 		/* netns clean up started, abort */
758 		if (!ipvs->enable)
759 			goto unlock2;
760 		kd = ipvs->est_kt_arr[id];
761 		if (!kd)
762 			continue;
763 		ip_vs_est_kthread_stop(kd);
764 	}
765 	mutex_unlock(&ipvs->est_mutex);
766 
767 	/* Move all estimators to est_temp_list but carefully,
768 	 * all estimators and kthread data can be released while
769 	 * we reschedule. Even for kthread 0.
770 	 */
771 	step = 0;
772 
773 	/* To order the entries in est_temp_list by ascending delay (they are
774 	 * added at the head), walk delay (desc), id (desc), cid (asc)
775 	 */
776 	delay = IPVS_EST_NTICKS;
777 
778 next_delay:
779 	delay--;
780 	if (delay < 0)
781 		goto end_dequeue;
782 
783 last_kt:
784 	/* Destroy contexts backwards */
785 	id = ipvs->est_kt_count;
786 
787 next_kt:
788 	if (!ipvs->enable || kthread_should_stop())
789 		goto unlock;
790 	id--;
791 	if (id < 0)
792 		goto next_delay;
793 	kd = ipvs->est_kt_arr[id];
794 	if (!kd)
795 		goto next_kt;
796 	/* kt 0 can exist with empty chains */
797 	if (!id && kd->est_count <= 1)
798 		goto next_delay;
799 
800 	row = kd->est_row + delay;
801 	if (row >= IPVS_EST_NTICKS)
802 		row -= IPVS_EST_NTICKS;
803 	td = rcu_dereference_protected(kd->ticks[row], 1);
804 	if (!td)
805 		goto next_kt;
806 
807 	cid = 0;
808 
809 walk_chain:
810 	if (kthread_should_stop())
811 		goto unlock;
812 	step++;
813 	if (!(step & 63)) {
814 		/* Give a chance for estimators to be added (to est_temp_list)
815 		 * and deleted (releasing kthread contexts)
816 		 */
817 		mutex_unlock(&__ip_vs_mutex);
818 		cond_resched();
819 		mutex_lock(&__ip_vs_mutex);
820 
821 		/* Current kt released ? */
822 		if (id >= ipvs->est_kt_count)
823 			goto last_kt;
824 		if (kd != ipvs->est_kt_arr[id])
825 			goto next_kt;
826 		/* Current td released ? */
827 		if (td != rcu_dereference_protected(kd->ticks[row], 1))
828 			goto next_kt;
829 		/* No fatal changes on the current kd and td */
830 	}
831 	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
832 			       list);
833 	if (!est) {
834 		cid++;
835 		if (cid >= IPVS_EST_TICK_CHAINS)
836 			goto next_kt;
837 		goto walk_chain;
838 	}
839 	/* We can cheat and increase est_count to protect kt 0 context
840 	 * from release but we prefer to keep the last estimator
841 	 */
842 	last = kd->est_count <= 1;
843 	/* Do not free kt #0 data */
844 	if (!id && last)
845 		goto next_delay;
846 	last_td = kd->tick_len[row] <= 1;
847 	stats = container_of(est, struct ip_vs_stats, est);
848 	ip_vs_stop_estimator(ipvs, stats);
849 	/* Tasks are stopped, move without RCU grace period */
850 	est->ktid = -1;
851 	est->ktrow = row - kd->est_row;
852 	if (est->ktrow < 0)
853 		est->ktrow += IPVS_EST_NTICKS;
854 	hlist_add_head(&est->list, &ipvs->est_temp_list);
855 	/* kd freed ? */
856 	if (last)
857 		goto next_kt;
858 	/* td freed ? */
859 	if (last_td)
860 		goto next_kt;
861 	goto walk_chain;
862 
863 end_dequeue:
864 	/* All estimators removed while calculating ? */
865 	if (!ipvs->est_kt_count)
866 		goto unlock;
867 	kd = ipvs->est_kt_arr[0];
868 	if (!kd)
869 		goto unlock;
870 	kd->add_row = kd->est_row;
871 	ipvs->est_chain_max = chain_max;
872 	ip_vs_est_set_params(ipvs, kd);
873 
874 	pr_info("using max %d ests per chain, %d per kthread\n",
875 		kd->chain_max, kd->est_max_count);
876 
877 	/* Try to keep tot_stats in kt0, enqueue it early */
878 	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
879 	    ipvs->tot_stats->s.est.ktid == -1) {
880 		hlist_del(&ipvs->tot_stats->s.est.list);
881 		hlist_add_head(&ipvs->tot_stats->s.est.list,
882 			       &ipvs->est_temp_list);
883 	}
884 
885 	mutex_lock(&ipvs->est_mutex);
886 
887 	/* We completed the calc phase, new calc phase not requested */
888 	if (genid == atomic_read(&ipvs->est_genid))
889 		ipvs->est_calc_phase = 0;
890 
891 unlock2:
892 	mutex_unlock(&ipvs->est_mutex);
893 
894 unlock:
895 	mutex_unlock(&__ip_vs_mutex);
896 }
897 
898 void ip_vs_zero_estimator(struct ip_vs_stats *stats)
899 {
900 	struct ip_vs_estimator *est = &stats->est;
901 	struct ip_vs_kstats *k = &stats->kstats;
902 
903 	/* reset counters, caller must hold stats->lock */
904 	est->last_inbytes = k->inbytes;
905 	est->last_outbytes = k->outbytes;
906 	est->last_conns = k->conns;
907 	est->last_inpkts = k->inpkts;
908 	est->last_outpkts = k->outpkts;
909 	est->cps = 0;
910 	est->inpps = 0;
911 	est->outpps = 0;
912 	est->inbps = 0;
913 	est->outbps = 0;
914 }
915 
916 /* Get decoded rates */
917 void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
918 {
919 	struct ip_vs_estimator *e = &stats->est;
920 
921 	dst->cps = (e->cps + 0x1FF) >> 10;
922 	dst->inpps = (e->inpps + 0x1FF) >> 10;
923 	dst->outpps = (e->outpps + 0x1FF) >> 10;
924 	dst->inbps = (e->inbps + 0xF) >> 5;
925 	dst->outbps = (e->outbps + 0xF) >> 5;
926 }
927 
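/* Decoding example (for illustration): the stored rates are divided by
 * their scale with rounding to the nearest integer, ties rounded down,
 * because the added constant is one less than half the scale.  E.g. an
 * inbps value of 41605 (scaled by 2^5) decodes as (41605 + 0xF) >> 5 = 1300,
 * and a cps value of 230400 (scaled by 2^10) decodes as
 * (230400 + 0x1FF) >> 10 = 225 conns/s.
 */
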
928 int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
929 {
930 	INIT_HLIST_HEAD(&ipvs->est_temp_list);
931 	ipvs->est_kt_arr = NULL;
932 	ipvs->est_max_threads = 0;
933 	ipvs->est_calc_phase = 0;
934 	ipvs->est_chain_max = 0;
935 	ipvs->est_kt_count = 0;
936 	ipvs->est_add_ktid = 0;
937 	atomic_set(&ipvs->est_genid, 0);
938 	atomic_set(&ipvs->est_genid_done, 0);
939 	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
940 	return 0;
941 }
942 
943 void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
944 {
945 	int i;
946 
947 	for (i = 0; i < ipvs->est_kt_count; i++)
948 		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
949 	kfree(ipvs->est_kt_arr);
950 	mutex_destroy(&ipvs->est_mutex);
951 }
952