/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
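
	Worked example (illustrative numbers, not taken from the code): with
	R = 125000 bytes/sec (1 Mbit/s) and B = 10000 bytes, a full bucket
	holds N = B/R = 80 ms of credit.  A 1500 byte packet costs
	S/R = 12 ms, so after an idle period roughly B/S ~= 6 such packets
	may leave back to back before the flow settles down to one packet
	every 12 ms.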


	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
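
	For example (again illustrative numbers): with R = 125000 bytes/sec,
	B = 10000 bytes, P = 250000 bytes/sec, M = 1500 bytes and a backlog
	of L = 50000 bytes,

	lat = max((50000-10000)/125000, (50000-1500)/250000)
	    = max(320 ms, 194 ms) = 320 ms.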


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit Ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, the limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc; if the inner qdisc is
	replaced, the limit is no longer effective.
*/
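
/*
 * Illustrative sketch (not part of the original sources): the token
 * update that tbf_dequeue() below performs, written out in plain C over
 * times in scheduler clock units.  len2time_r()/len2time_p() are
 * hypothetical stand-ins for the L2T()/L2T_P() rate-table lookups.
 *
 *	elapsed = now - t_c;                     bounded above by 'buffer'
 *	toks  = min(tokens  + elapsed, buffer) - len2time_r(len);
 *	ptoks = min(ptokens + elapsed, mtu)    - len2time_p(len);
 *	if (toks >= 0 && ptoks >= 0)
 *		send the packet, save toks/ptoks and set t_c = now;
 *	else
 *		requeue it and arm the watchdog for max(-toks, -ptoks).
 */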

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

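/*
 * L2T()/L2T_P() convert a packet length L into the time needed to send
 * it at the configured rate (R_tab) or peak rate (P_tab): L >> cell_log
 * selects one of the 256 cells of the rate table supplied by userspace,
 * whose entry holds the precomputed transmission time for packets of
 * that size.
 */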
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (skb->len > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

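/*
 * Dequeue: take the head packet from the inner qdisc and hand it out
 * only if both token buckets (rate and, when configured, peak rate)
 * hold enough credit for its length; otherwise requeue it and arm the
 * watchdog (see the sketch after the header comment above).
 */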
static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

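		/* Not enough tokens: program the watchdog to fire once the
		 * larger of the two deficits (-toks or -ptoks) has been
		 * replenished, i.e. at the earliest moment the packet may
		 * legitimately be sent.
		 */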
		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but this is wrong in principle.
		   We MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->qstats.overlimits++;
	}
	return NULL;
}

static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

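/*
 * Build the default child qdisc: a bfifo limited to 'limit' bytes,
 * grafted as minor 1 under this TBF.  Returns NULL (after destroying
 * any partially constructed qdisc) if allocation or configuration of
 * the fifo fails.
 */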
static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct rtattr *rta;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
		if (rta) {
			rta->rta_type = RTM_NEWQDISC;
			rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

			ret = q->ops->change(q, rta);
			kfree(rta);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}

static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size, n;

	if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

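	/* Compute max_size: the largest packet length whose transmission
	 * time, according to the rate table (and the peak rate table, if
	 * one is used), still fits into the configured buffer (resp. mtu).
	 * Larger packets could never accumulate enough tokens and are
	 * therefore dropped already at enqueue time.
	 */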
	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log)-1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log)-1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb_tail_pointer(skb) - b;

	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");