xref: /openbmc/linux/net/core/gen_estimator.c (revision 384740dc)
1 /*
2  * net/sched/gen_estimator.c	Simple rate estimator.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Changes:
12  *              Jamal Hadi Salim - moved it to net/core and reshulfed
13  *              names to make it usable in general net subsystem.
14  */
15 
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/jiffies.h>
23 #include <linux/string.h>
24 #include <linux/mm.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/in.h>
28 #include <linux/errno.h>
29 #include <linux/interrupt.h>
30 #include <linux/netdevice.h>
31 #include <linux/skbuff.h>
32 #include <linux/rtnetlink.h>
33 #include <linux/init.h>
34 #include <net/sock.h>
35 #include <net/gen_stats.h>
36 
37 /*
38    This code is NOT intended to be used for statistics collection,
39    its purpose is to provide a base for statistical multiplexing
40    for controlled load service.
41    If you need only statistics, run a user level daemon which
42    periodically reads byte counters.
43 
44    Unfortunately, rate estimation is not a very easy task.
45    F.e. I did not find a simple way to estimate the current peak rate
46    and even failed to formulate the problem 8)8)
47 
48    So I preferred not to built an estimator into the scheduler,
49    but run this task separately.
50    Ideally, it should be kernel thread(s), but for now it runs
51    from timers, which puts apparent top bounds on the number of rated
52    flows, has minimal overhead on small, but is enough
53    to handle controlled load service, sets of aggregates.
54 
55    We measure rate over A=(1<<interval) seconds and evaluate EWMA:
56 
57    avrate = avrate*(1-W) + rate*W
58 
59    where W is chosen as negative power of 2: W = 2^(-ewma_log)
60 
61    The resulting time constant is:
62 
63    T = A/(-ln(1-W))
64 
65 
66    NOTES.
67 
68    * The stored value for avbps is scaled by 2^5, so that maximal
69      rate is ~1Gbit, avpps is scaled by 2^10.
70 
71    * Minimal interval is HZ/4=250msec (it is the greatest common divisor
72      for HZ=100 and HZ=1024 8)), maximal interval
73      is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
74      are too expensive, longer ones can be implemented
75      at user level painlessly.
76  */
77 
78 #define EST_MAX_INTERVAL	5
79 
80 struct gen_estimator
81 {
82 	struct list_head	list;
83 	struct gnet_stats_basic	*bstats;
84 	struct gnet_stats_rate_est	*rate_est;
85 	spinlock_t		*stats_lock;
86 	int			ewma_log;
87 	u64			last_bytes;
88 	u32			last_packets;
89 	u32			avpps;
90 	u32			avbps;
91 	struct rcu_head		e_rcu;
92 };
93 
94 struct gen_estimator_head
95 {
96 	struct timer_list	timer;
97 	struct list_head	list;
98 };
99 
100 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
101 
102 /* Protects against NULL dereference */
103 static DEFINE_RWLOCK(est_lock);
104 
105 static void est_timer(unsigned long arg)
106 {
107 	int idx = (int)arg;
108 	struct gen_estimator *e;
109 
110 	rcu_read_lock();
111 	list_for_each_entry_rcu(e, &elist[idx].list, list) {
112 		u64 nbytes;
113 		u32 npackets;
114 		u32 rate;
115 
116 		spin_lock(e->stats_lock);
117 		read_lock(&est_lock);
118 		if (e->bstats == NULL)
119 			goto skip;
120 
121 		nbytes = e->bstats->bytes;
122 		npackets = e->bstats->packets;
123 		rate = (nbytes - e->last_bytes)<<(7 - idx);
124 		e->last_bytes = nbytes;
125 		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
126 		e->rate_est->bps = (e->avbps+0xF)>>5;
127 
128 		rate = (npackets - e->last_packets)<<(12 - idx);
129 		e->last_packets = npackets;
130 		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
131 		e->rate_est->pps = (e->avpps+0x1FF)>>10;
132 skip:
133 		read_unlock(&est_lock);
134 		spin_unlock(e->stats_lock);
135 	}
136 
137 	if (!list_empty(&elist[idx].list))
138 		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
139 	rcu_read_unlock();
140 }
141 
142 /**
143  * gen_new_estimator - create a new rate estimator
144  * @bstats: basic statistics
145  * @rate_est: rate estimator statistics
146  * @stats_lock: statistics lock
147  * @opt: rate estimator configuration TLV
148  *
149  * Creates a new rate estimator with &bstats as source and &rate_est
150  * as destination. A new timer with the interval specified in the
151  * configuration TLV is created. Upon each interval, the latest statistics
152  * will be read from &bstats and the estimated rate will be stored in
153  * &rate_est with the statistics lock grabed during this period.
154  *
155  * Returns 0 on success or a negative error code.
156  *
157  * NOTE: Called under rtnl_mutex
158  */
159 int gen_new_estimator(struct gnet_stats_basic *bstats,
160 		      struct gnet_stats_rate_est *rate_est,
161 		      spinlock_t *stats_lock,
162 		      struct nlattr *opt)
163 {
164 	struct gen_estimator *est;
165 	struct gnet_estimator *parm = nla_data(opt);
166 	int idx;
167 
168 	if (nla_len(opt) < sizeof(*parm))
169 		return -EINVAL;
170 
171 	if (parm->interval < -2 || parm->interval > 3)
172 		return -EINVAL;
173 
174 	est = kzalloc(sizeof(*est), GFP_KERNEL);
175 	if (est == NULL)
176 		return -ENOBUFS;
177 
178 	idx = parm->interval + 2;
179 	est->bstats = bstats;
180 	est->rate_est = rate_est;
181 	est->stats_lock = stats_lock;
182 	est->ewma_log = parm->ewma_log;
183 	est->last_bytes = bstats->bytes;
184 	est->avbps = rate_est->bps<<5;
185 	est->last_packets = bstats->packets;
186 	est->avpps = rate_est->pps<<10;
187 
188 	if (!elist[idx].timer.function) {
189 		INIT_LIST_HEAD(&elist[idx].list);
190 		setup_timer(&elist[idx].timer, est_timer, idx);
191 	}
192 
193 	if (list_empty(&elist[idx].list))
194 		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
195 
196 	list_add_rcu(&est->list, &elist[idx].list);
197 	return 0;
198 }
199 
200 static void __gen_kill_estimator(struct rcu_head *head)
201 {
202 	struct gen_estimator *e = container_of(head,
203 					struct gen_estimator, e_rcu);
204 	kfree(e);
205 }
206 
207 /**
208  * gen_kill_estimator - remove a rate estimator
209  * @bstats: basic statistics
210  * @rate_est: rate estimator statistics
211  *
212  * Removes the rate estimator specified by &bstats and &rate_est
213  * and deletes the timer.
214  *
215  * NOTE: Called under rtnl_mutex
216  */
217 void gen_kill_estimator(struct gnet_stats_basic *bstats,
218 	struct gnet_stats_rate_est *rate_est)
219 {
220 	int idx;
221 	struct gen_estimator *e, *n;
222 
223 	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
224 
225 		/* Skip non initialized indexes */
226 		if (!elist[idx].timer.function)
227 			continue;
228 
229 		list_for_each_entry_safe(e, n, &elist[idx].list, list) {
230 			if (e->rate_est != rate_est || e->bstats != bstats)
231 				continue;
232 
233 			write_lock_bh(&est_lock);
234 			e->bstats = NULL;
235 			write_unlock_bh(&est_lock);
236 
237 			list_del_rcu(&e->list);
238 			call_rcu(&e->e_rcu, __gen_kill_estimator);
239 		}
240 	}
241 }
242 
243 /**
244  * gen_replace_estimator - replace rate estimator configuration
245  * @bstats: basic statistics
246  * @rate_est: rate estimator statistics
247  * @stats_lock: statistics lock
248  * @opt: rate estimator configuration TLV
249  *
250  * Replaces the configuration of a rate estimator by calling
251  * gen_kill_estimator() and gen_new_estimator().
252  *
253  * Returns 0 on success or a negative error code.
254  */
255 int gen_replace_estimator(struct gnet_stats_basic *bstats,
256 			  struct gnet_stats_rate_est *rate_est,
257 			  spinlock_t *stats_lock, struct nlattr *opt)
258 {
259 	gen_kill_estimator(bstats, rate_est);
260 	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
261 }
262 
263 
264 EXPORT_SYMBOL(gen_kill_estimator);
265 EXPORT_SYMBOL(gen_new_estimator);
266 EXPORT_SYMBOL(gen_replace_estimator);
267