xref: /openbmc/linux/net/sched/sch_tbf.c (revision 545e4006)
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)
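
	As a purely illustrative check (numbers chosen here, not taken
	from the original text): with R = 125000 bytes/sec and B = 10000
	bytes, a back-to-back burst of up to 10000 bytes is allowed, but
	a source sending 1500 bytes every 10 msec (150000 bytes/sec)
	violates the inequality within one second:
	150000 > 10000 + 125000*1.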

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
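
	Worked example (hypothetical numbers, for illustration only):
	with B = 10000 bytes and R = 125000 bytes/sec, N starts at
	B/R = 80 msec.  A 1500-byte packet needs S/R = 12 msec of credit,
	so it may be sent immediately and N drops to 68 msec.  Once a
	burst has driven N close to zero, the next 1500-byte packet must
	wait until N has grown back to at least 12 msec, i.e. the flow is
	paced at R on average.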


	In practice, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size; the other,
	with rate P (the peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinite, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
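
	For example (numbers invented for illustration): with a backlog
	L = 30000 bytes, B = 10000 bytes, R = 125000 bytes/sec,
	M = 1500 bytes and P = 250000 bytes/sec, we get
	(L-B)/R = 160 msec and (L-M)/P = 114 msec, so lat = 160 msec;
	here the steady-state bucket, not the peak-rate one, dominates.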


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken by an EOI for some previous packet,
	TBF can stall for up to 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500,
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use a platform with HZ=1000 (e.g. Alpha) :-)
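
	Spelling out that arithmetic: a 1/HZ stall means the bucket must
	hold at least one tick's worth of data, i.e. R <= B*HZ.  At
	10Mbit/sec (~1.25Mbytes/sec) and HZ=100 this requires
	B >= ~12.5Kbytes, the same order as the ~10Kbytes quoted above;
	for the peak bucket, M = 1500 bytes gives
	P_crit = 1500*100 = 150Kbytes/sec.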

	With classful TBF, the limit is kept only for backwards compatibility.
	It is passed to the default bfifo child qdisc; if the inner qdisc is
	replaced, the limit no longer has any effect.
*/

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};
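
/*
 * Editorial note: tokens, ptokens, buffer and mtu are all kept in time
 * units (psched ticks), not bytes: "buffer" is the time needed to send
 * a full bucket at rate R, "mtu" the time to send a peak-rate bucket
 * (one MTU-sized packet) at rate P, while max_size is the largest
 * packet, in bytes, that will ever be accepted.  The L2T()/L2T_P()
 * helpers below convert a packet length into the transmission time it
 * costs against the respective bucket.
 */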

#define L2T(q,L)   qdisc_l2t((q)->R_tab,L)
#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)

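/*
 * tbf_enqueue(): editorial summary of the code below.  Packets larger
 * than max_size can never earn enough tokens and are dropped up front
 * (under CONFIG_NET_CLS_ACT, reshape_fail() gets a chance to reclaim
 * them first); everything else is handed to the inner qdisc and, on
 * success, the byte/packet counters and queue length are updated.
 */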
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_ACT
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += qdisc_pkt_len(skb);
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

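/*
 * tbf_dequeue(): editorial summary.  The head packet is pulled from the
 * inner qdisc; the time elapsed since the last checkpoint t_c is turned
 * into fresh tokens, clamped to "buffer" (and to "mtu" for the peak
 * bucket), and the packet's transmission time is charged against both
 * buckets.  If neither balance goes negative the packet is released and
 * t_c advances; otherwise the packet is put back and the watchdog is
 * armed for the moment the missing tokens will have accumulated.
 */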
static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which could be sent now.  That sounds tempting,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   If we split the flow into independent subflows,
		   sending the shorter packet first would be a perfectly
		   good solution.  That is the main idea behind all FQ
		   algorithms (cf. CSZ, HPFQ, HFSC).
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails the skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

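/*
 * tbf_change(): editorial summary.  The TCA_TBF_* attributes are
 * parsed, rate tables for the committed (and optional peak) rate are
 * looked up, and max_size is derived as the largest packet whose
 * transmission time still fits into "buffer" (and "mtu").  If a limit
 * is given, a bfifo child of that size is created; finally the new
 * parameters and tables are swapped in under the qdisc tree lock.
 */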
static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_PTAB + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

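	/*
	 * Editorial comment, not in the original source: rtab->data[n]
	 * holds the time needed to send a packet whose size falls into
	 * slot n (slots are 2^cell_log bytes wide).  Scan for the first
	 * slot that would cost more than the configured buffer (resp.
	 * mtu for the peak table); max_size is then the largest byte
	 * count that still fits.
	 */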
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

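/*
 * tbf_dump(): editorial summary.  The current parameters are packed
 * into a tc_tbf_qopt and emitted as TCA_TBF_PARMS inside a nested
 * TCA_OPTIONS attribute, mirroring what tbf_change() accepts.
 */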
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}
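
/*
 * tbf_graft(): editorial summary.  The inner qdisc is replaced under
 * the tree lock (noop_qdisc if none is given); the old child is reset
 * and handed back to the caller for destruction.
 */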

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

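/*
 * Editorial note: TBF is classful only in the minimal sense - it
 * exposes exactly one class (minor 1) that holds the inner qdisc, so
 * classes cannot be created or deleted and there is no filter chain.
 */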
static const struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");