/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
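
	A worked example (numbers purely illustrative, not taken from any real
	configuration): let R = 125000 bytes/sec and B = 12500 bytes, so N
	starts at B/R = 0.1 sec. A 1500 byte packet costs 1500/R = 0.012 sec
	of token time. After a long idle period eight such packets can go back
	to back (N drops to 0.1 - 8*0.012 = 0.004 sec, ignoring transmission
	time on a fast link); the ninth must wait about 8 msec for N to grow
	back to 0.012 sec.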



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size; the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
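
	For example (again with made-up numbers): with a byte limit
	L = 100000, B = 12500, R = 125000 bytes/sec, M = 1500 and
	P = 1250000 bytes/sec, the worst-case queueing latency is
	max((100000-12500)/125000, (100000-1500)/1250000) =
	max(0.7, 0.0788) = 0.7 sec.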


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use a platform with HZ=1000 (such as alpha) :-)
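
	(To spell out the arithmetic: 10 Mbit/sec is 1.25 Mbyte/sec, so
	B >= R/HZ = 1.25 Mbyte / 100 = 12.5 Kbytes, i.e. ~10 Kbytes; for the
	peak bucket the depth is fixed at the MTU, so P_crit = M*HZ =
	1500 * 100 = 150 Kbytes/sec with HZ=100.)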

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo child qdisc; once the inner qdisc
	is changed, the limit has no effect anymore.
*/
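
/* A typical configuration from user space (illustrative only; the interface
 * name and numbers below are made up and are not part of this file) would be:
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 50ms
 *
 * which maps to R (rate), B (burst/buffer) and, via the requested latency,
 * to the byte limit handed to the default bfifo child.
 */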

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

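/* Convert a packet length into its token cost (transmission time in
 * scheduler ticks) using the configured rate table: L2T for the main
 * bucket (R_tab), L2T_P for the peak-rate bucket (P_tab).
 */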
#define L2T(q,L)   qdisc_l2t((q)->R_tab,L)
#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)

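/* Enqueue: oversized packets (len > max_size) can never earn enough tokens,
 * so they are counted as drops right away; with CONFIG_NET_CLS_ACT a
 * reshape_fail hook may take the skb back instead of it being freed.
 * Everything else goes to the inner qdisc and the byte/packet counters are
 * updated.
 */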
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (skb->len > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_ACT
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

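/* Dequeue pulls the head packet from the inner qdisc and, based on the time
 * elapsed since the last checkpoint (t_c), checks whether both buckets hold
 * enough tokens to send it now; if not, the packet is requeued and the
 * watchdog is armed for the moment the token deficit will be repaid.
 */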
static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

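		/* Send only if neither bucket is in deficit; OR-ing the two
		 * signed values is negative exactly when at least one of
		 * them is negative.
		 */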
		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but this is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

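/* Build the default bfifo child with the requested byte limit by constructing
 * a tc_fifo_qopt netlink attribute by hand and feeding it to the fifo's
 * ->change() operation.
 */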
static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct nlattr *nla;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)),
			      GFP_KERNEL);
		if (nla) {
			nla->nla_type = RTM_NEWQDISC;
			nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;

			ret = q->ops->change(q, nla);
			kfree(nla);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

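/* (Re)configure the qdisc from TCA_TBF_* attributes: look up the rate table
 * (and optional peak-rate table), derive max_size as the largest packet whose
 * token cost still fits into the buffer/mtu, and, if a limit was given,
 * replace the inner qdisc with a fresh bfifo of that limit.
 */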
static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_PTAB + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

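	/* max_size = the largest packet that can ever be sent: walk the
	 * 256-slot rate table and stop at the first slot whose token cost
	 * exceeds the bucket (buffer for R, mtu for P).
	 */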
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

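/* TBF is classful only so that the inner qdisc can be inspected and replaced:
 * it exposes exactly one class (minor 1), permanently bound to q->qdisc, and
 * rejects class creation and deletion.
 */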
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static const struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");