/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i + ... + s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.

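	A rough worked example (numbers chosen purely for illustration):
	with R = 1Mbit/sec and B = 10Kbit the bucket starts with
	N = B/R = 10msec of credit.  A 5Kbit packet costs S/R = 5msec,
	so it is sent at once and N drops to 5msec; N then grows back
	one msec per msec of idle time, saturating at 10msec.  A packet
	needing more credit than is currently available has to wait
	until N has grown back to S/R.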


	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)

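	As a rough illustration, take a backlog of L = 100Kbyte with
	B = 10Kbyte, R = 1Mbyte/sec, M = 1500byte and P = 10Mbyte/sec
	(arbitrary numbers): (L-B)/R = 90msec while (L-M)/P is only
	about 10msec, so the estimated latency is 90msec, dominated
	by the long-term rate R.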

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

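	Spelling out that arithmetic with rough numbers: 10Mbit/sec is
	1.25Mbyte/sec, so with HZ=100 the depth must be at least
	R/HZ = 12.5Kbyte, i.e. on the order of the ~10Kbytes quoted
	above.  The peak bucket is only one MTU deep, so with MTU=1500
	and HZ=100 the peak rate cannot usefully exceed
	1500*100 = 150Kbytes/sec.
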
	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;		/* Peak-rate bucket depth: MTU/P, same units as buffer */
	u32		max_size;	/* Upper bound on a single packet's size: bytes */
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

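/* Look up the time needed to transmit a packet of length L, taken from
 * the configured rate table (in internal psched time units); L2T uses
 * the main rate, L2T_P the peak rate.
 */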
#define L2T(q,L)   qdisc_l2t((q)->R_tab,L)
#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

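	/* A packet larger than max_size could never gather enough tokens,
	 * even from a full bucket, so drop it up front (or, with classifier
	 * actions compiled in, let reshape_fail try to handle it first).
	 */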
	if (skb->len > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_ACT
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

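		/* Token accounting, all in time units: the credit earned since
		 * the last check-point (bounded by the bucket depth) is added
		 * to the leftover tokens, capped at the depth, and the cost of
		 * this packet is subtracted.  The peak-rate bucket, when
		 * configured, is handled the same way with depth q->mtu.
		 */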
		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

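		/* Both budgets must be non-negative; OR-ing the two signed
		 * longs and testing the sign checks that in one comparison.
		 */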
		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe there is a shorter packet in the queue
		   which could be sent now.  That sounds tempting,
		   but it would be wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   If we split the flow into independent subflows,
		   sending the shorter packet first would be a very
		   good solution; that is the main idea behind all
		   FQ algorithms (cf. CSZ, HPFQ, HFSC).
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

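/* Create the default bfifo child qdisc with the given byte limit.  The
 * fifo's change() hook expects a netlink attribute, so one is crafted by
 * hand around a struct tc_fifo_qopt.
 */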
static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct nlattr *nla;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)),
			      GFP_KERNEL);
		if (nla) {
			nla->nla_type = RTM_NEWQDISC;
			nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;

			ret = q->ops->change(q, nla);
			kfree(nla);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}

static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_PTAB + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	if (nla_parse_nested(tb, TCA_TBF_PTAB, opt, NULL) ||
	    tb[TCA_TBF_PARMS] == NULL ||
	    nla_len(tb[TCA_TBF_PARMS]) < sizeof(*qopt))
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
		if (ptab == NULL)
			goto done;
	}

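	/* The rate table maps packet-size buckets of 2^cell_log bytes to
	 * transmission times.  Scan it to find (roughly) the largest packet
	 * that a full bucket (buffer, and mtu for the peak rate) can pay
	 * for; anything bigger is rejected up front in tbf_enqueue().
	 */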
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log)-1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log)-1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct nlattr *nla;
	struct tc_tbf_qopt opt;

	nla = (struct nlattr*)b;
	NLA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	nla->nla_len = skb_tail_pointer(skb) - b;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

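/* TBF is classful only in the sense that it exposes a single class,
 * minor number 1, which holds the inner qdisc; the class itself cannot
 * be created or deleted.
 */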
static const struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");