/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

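	F.e. with R = 125 Kbytes/sec and B = 10 Kbytes, any interval of
	length T may carry at most B + R*T bytes: at most 135 Kbytes in
	any one-second window, however the data are packetized.
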
	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.


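	The same rule in rough pseudo-code (tbf_dequeue() below implements
	it with tokens counted in scaled time units instead of seconds;
	N and t_c are committed only when the packet is actually sent):

		n = min(B/R, N + (now - t_c));
		if (n >= S/R) {
			N = n - S/R;
			t_c = now;
			transmit the packet;
		} else
			sleep for (S/R - n);

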
	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady state burst size, another
	one with rate P (peak rate) and depth M (equal to link MTU)
	limits bursts at a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


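	F.e. with limit L = 10 Kbytes, R = 125 Kbytes/sec, B = 2 Kbytes,
	P = 1250 Kbytes/sec and M = 1500 bytes this gives roughly
	max(8/125, 8.5/1250) sec = 64 msec, dominated by the R term.

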
	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
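
/* Typical userspace configuration (illustrative example, values are
 * arbitrary):
 *
 *	tc qdisc add dev eth0 root tbf rate 256kbit burst 1600 limit 3000
 *
 * "rate" fills R_tab, "burst" determines the bucket depth (the buffer
 * parameter, expressed as transmission time at "rate"), and "limit"
 * sizes the default bfifo child created in tbf_change(); "peakrate"
 * and "mtu" would additionally fill P_tab.
 */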

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

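/* L2T ("length to time") maps a packet of L bytes to its transmission
 * time according to the rate table supplied by userspace: the table is
 * indexed by the packet size in units of 2^cell_log bytes.  L2T uses
 * the R (rate) table, L2T_P the P (peakrate) table.
 */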
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (skb->len > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_ACT
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

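/* Dequeue path: take the head packet of the inner qdisc, replenish the
 * token counters from the time elapsed since t_c (capped at the bucket
 * depths), and charge the packet against both buckets.  If either
 * bucket would go negative, the packet is requeued and the watchdog is
 * armed for the moment when enough tokens will have accumulated.
 */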
static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

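/* Build the default child: a bfifo qdisc limited to "limit" bytes.  The
 * fifo is configured through its regular ->change() hook, so a netlink
 * attribute carrying a tc_fifo_qopt is assembled by hand here.
 */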
static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct rtattr *rta;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
		if (rta) {
			rta->rta_type = RTM_NEWQDISC;
			rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

			ret = q->ops->change(q, rta);
			kfree(rta);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}

static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

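	/* Derive max_size: the largest packet whose cost still fits into
	 * the bucket (buffer for the rate table, mtu for the peak rate
	 * table).  Anything larger could never gather enough tokens and
	 * is rejected up front in tbf_enqueue().
	 */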
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer)
			break;
	max_size = (n << qopt->rate.cell_log) - 1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu)
				break;
		size = (n << qopt->peakrate.cell_log) - 1;
		if (size < max_size)
			max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr *)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb_tail_pointer(skb) - b;

	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");