/*
 * net/sched/gen_estimator.c	Simple rate estimator.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 *              Jamal Hadi Salim - moved it to net/core and reshuffled
 *              names to make it usable in the general net subsystem.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rbtree.h>
#include <net/sock.h>
#include <net/gen_stats.h>

/*
   This code is NOT intended to be used for statistics collection;
   its purpose is to provide a base for statistical multiplexing
   for controlled load service.
   If you need only statistics, run a user level daemon which
   periodically reads byte counters.

   Unfortunately, rate estimation is not a very easy task.
   E.g. I did not find a simple way to estimate the current peak rate
   and even failed to formulate the problem 8)8)

   So I preferred not to build an estimator into the scheduler,
   but to run this task separately.
   Ideally, it should be kernel thread(s), but for now it runs
   from timers, which puts an apparent upper bound on the number of rated
   flows, has minimal overhead for a small number of flows, and is enough
   to handle controlled load service and sets of aggregates.

   We measure rate over A=(1<<interval) seconds and evaluate EWMA:

   avrate = avrate*(1-W) + rate*W

   where W is chosen as a negative power of 2: W = 2^(-ewma_log)

   The resulting time constant is:

   T = A/(-ln(1-W))


   NOTES.

   * The stored value for avbps is scaled by 2^5, so that the maximal
     rate is ~1Gbit/s; avpps is scaled by 2^10.

   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
     for HZ=100 and HZ=1024 8)), maximal interval
     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
     are too expensive, longer ones can be implemented
     at user level painlessly.
 */
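
/*
 * Worked example of the scaling above (an illustrative sketch; idx and the
 * timer period are the ones used by gen_new_estimator() and est_timer()
 * below).  The timer for slot idx fires every (HZ/4) << idx jiffies, i.e.
 * every 2^(idx-2) seconds, so est_timer() computes
 *
 *	rate = bytes_per_period << (7 - idx)
 *	     = (bytes_per_sec * 2^(idx-2)) << (7 - idx)
 *	     = bytes_per_sec << 5
 *
 * which is the 2^5-scaled avbps; the packet branch shifts by (12 - idx)
 * and yields the 2^10-scaled avpps the same way.
 */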

#define EST_MAX_INTERVAL	5

struct gen_estimator
{
	struct list_head	list;
	struct gnet_stats_basic	*bstats;
	struct gnet_stats_rate_est	*rate_est;
	spinlock_t		*stats_lock;
	int			ewma_log;
	u64			last_bytes;
	u32			last_packets;
	u32			avpps;
	u32			avbps;
	struct rcu_head		e_rcu;
	struct rb_node		node;
};

struct gen_estimator_head
{
	struct timer_list	timer;
	struct list_head	list;
};

static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];

/* Protects against NULL dereference */
static DEFINE_RWLOCK(est_lock);

/*
 * Active estimators are also kept in an rbtree keyed by their bstats
 * pointer, so gen_kill_estimator() can find them without scanning the
 * per-interval lists; this protects against soft lockup during large
 * deletions.
 */
static struct rb_root est_root = RB_ROOT;

static void est_timer(unsigned long arg)
{
	int idx = (int)arg;
	struct gen_estimator *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, &elist[idx].list, list) {
		u64 nbytes;
		u32 npackets;
		u32 rate;

		spin_lock(e->stats_lock);
		read_lock(&est_lock);
		if (e->bstats == NULL)
			goto skip;

		nbytes = e->bstats->bytes;
		npackets = e->bstats->packets;
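		/* bytes seen in the last 2^(idx-2)s period, scaled to 2^5 * bytes/sec */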
		rate = (nbytes - e->last_bytes)<<(7 - idx);
		e->last_bytes = nbytes;
		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
		e->rate_est->bps = (e->avbps+0xF)>>5;

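		/* same EWMA step for packets, scaled to 2^10 * packets/sec */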
		rate = (npackets - e->last_packets)<<(12 - idx);
		e->last_packets = npackets;
		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
		e->rate_est->pps = (e->avpps+0x1FF)>>10;
skip:
		read_unlock(&est_lock);
		spin_unlock(e->stats_lock);
	}

	if (!list_empty(&elist[idx].list))
		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
	rcu_read_unlock();
}

static void gen_add_node(struct gen_estimator *est)
{
	struct rb_node **p = &est_root.rb_node, *parent = NULL;

	while (*p) {
		struct gen_estimator *e;

		parent = *p;
		e = rb_entry(parent, struct gen_estimator, node);

		if (est->bstats > e->bstats)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&est->node, parent, p);
	rb_insert_color(&est->node, &est_root);
}

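/*
 * Estimators that share a bstats pointer all end up in the left subtree
 * via gen_add_node() above, so the lookup below keeps descending left
 * while bstats matches but rate_est does not.
 */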
static
struct gen_estimator *gen_find_node(const struct gnet_stats_basic *bstats,
				    const struct gnet_stats_rate_est *rate_est)
{
	struct rb_node *p = est_root.rb_node;

	while (p) {
		struct gen_estimator *e;

		e = rb_entry(p, struct gen_estimator, node);

		if (bstats > e->bstats)
			p = p->rb_right;
		else if (bstats < e->bstats || rate_est != e->rate_est)
			p = p->rb_left;
		else
			return e;
	}
	return NULL;
}

/**
 * gen_new_estimator - create a new rate estimator
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
 * @opt: rate estimator configuration TLV
 *
 * Creates a new rate estimator with &bstats as source and &rate_est
 * as destination. A new timer with the interval specified in the
 * configuration TLV is created. Upon each interval, the latest statistics
 * will be read from &bstats and the estimated rate will be stored in
 * &rate_est with the statistics lock grabbed during this period.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE: Called under rtnl_mutex
 */
int gen_new_estimator(struct gnet_stats_basic *bstats,
		      struct gnet_stats_rate_est *rate_est,
		      spinlock_t *stats_lock,
		      struct nlattr *opt)
{
	struct gen_estimator *est;
	struct gnet_estimator *parm = nla_data(opt);
	int idx;

	if (nla_len(opt) < sizeof(*parm))
		return -EINVAL;

	if (parm->interval < -2 || parm->interval > 3)
		return -EINVAL;

	est = kzalloc(sizeof(*est), GFP_KERNEL);
	if (est == NULL)
		return -ENOBUFS;

	idx = parm->interval + 2;
	est->bstats = bstats;
	est->rate_est = rate_est;
	est->stats_lock = stats_lock;
	est->ewma_log = parm->ewma_log;
	est->last_bytes = bstats->bytes;
	est->avbps = rate_est->bps<<5;
	est->last_packets = bstats->packets;
	est->avpps = rate_est->pps<<10;

	if (!elist[idx].timer.function) {
		INIT_LIST_HEAD(&elist[idx].list);
		setup_timer(&elist[idx].timer, est_timer, idx);
	}

	if (list_empty(&elist[idx].list))
		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));

	list_add_rcu(&est->list, &elist[idx].list);
	gen_add_node(est);

	return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
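
/*
 * Example (an illustrative sketch, not an in-tree caller): a qdisc that
 * keeps its counters in q->bstats/q->rate_est, protected by q->lock, could
 * attach an estimator from the netlink attribute it was configured with
 * (e.g. tca[TCA_RATE]):
 *
 *	if (tca[TCA_RATE]) {
 *		err = gen_new_estimator(&q->bstats, &q->rate_est,
 *					&q->lock, tca[TCA_RATE]);
 *		if (err)
 *			goto errout;
 *	}
 */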

static void __gen_kill_estimator(struct rcu_head *head)
{
	struct gen_estimator *e = container_of(head,
					struct gen_estimator, e_rcu);
	kfree(e);
}

/**
 * gen_kill_estimator - remove a rate estimator
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 *
 * Removes the rate estimator specified by &bstats and &rate_est.
 *
 * NOTE: Called under rtnl_mutex
 */
void gen_kill_estimator(struct gnet_stats_basic *bstats,
			struct gnet_stats_rate_est *rate_est)
{
	struct gen_estimator *e;

	while ((e = gen_find_node(bstats, rate_est))) {
		rb_erase(&e->node, &est_root);

		write_lock_bh(&est_lock);
		e->bstats = NULL;
		write_unlock_bh(&est_lock);

		list_del_rcu(&e->list);
		call_rcu(&e->e_rcu, __gen_kill_estimator);
	}
}
EXPORT_SYMBOL(gen_kill_estimator);
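
/*
 * Teardown for the hypothetical qdisc sketched after gen_new_estimator()
 * above; done before the counters themselves are freed:
 *
 *	gen_kill_estimator(&q->bstats, &q->rate_est);
 */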

/**
 * gen_replace_estimator - replace rate estimator configuration
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
 * @opt: rate estimator configuration TLV
 *
 * Replaces the configuration of a rate estimator by calling
 * gen_kill_estimator() and gen_new_estimator().
 *
 * Returns 0 on success or a negative error code.
 */
int gen_replace_estimator(struct gnet_stats_basic *bstats,
			  struct gnet_stats_rate_est *rate_est,
			  spinlock_t *stats_lock, struct nlattr *opt)
{
	gen_kill_estimator(bstats, rate_est);
	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
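
/*
 * Reconfiguration for the same hypothetical qdisc, e.g. on a change
 * request carrying a new rate estimator TLV:
 *
 *	if (tca[TCA_RATE])
 *		err = gen_replace_estimator(&q->bstats, &q->rate_est,
 *					    &q->lock, tca[TCA_RATE]);
 */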

/**
 * gen_estimator_active - test if estimator is currently in use
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 *
 * Returns true if estimator is active, and false if not.
 */
bool gen_estimator_active(const struct gnet_stats_basic *bstats,
			  const struct gnet_stats_rate_est *rate_est)
{
	ASSERT_RTNL();

	return gen_find_node(bstats, rate_est) != NULL;
}
EXPORT_SYMBOL(gen_estimator_active);
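
/*
 * Illustrative check for the same hypothetical qdisc: a dump path running
 * under RTNL could verify that q->rate_est is still being refreshed before
 * reporting it:
 *
 *	if (gen_estimator_active(&q->bstats, &q->rate_est))
 *		rate_est_valid = true;
 */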
318