xref: /openbmc/linux/net/sched/sch_tbf.c (revision 45cc842d5b75ba8f9a958f2dd12b95c6dd0452bd)
1 /*
2  * net/sched/sch_tbf.c	Token Bucket Filter queue.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
11  *						 original idea by Martin Devera
12  *
13  */
14 
15 #include <linux/module.h>
16 #include <linux/types.h>
17 #include <linux/kernel.h>
18 #include <linux/string.h>
19 #include <linux/errno.h>
20 #include <linux/skbuff.h>
21 #include <net/netlink.h>
22 #include <net/sch_generic.h>
23 #include <net/pkt_sched.h>
24 
25 
26 /*	Simple Token Bucket Filter.
27 	=======================================
28 
29 	SOURCE.
30 	-------
31 
32 	None.
33 
34 	Description.
35 	------------
36 
37 	A data flow obeys TBF with rate R and depth B, if for any
38 	time interval t_i...t_f the number of transmitted bits
39 	does not exceed B + R*(t_f-t_i).
40 
41 	Packetized version of this definition:
42 	The sequence of packets of sizes s_i served at moments t_i
43 	obeys TBF, if for any i<=k:
44 
45 	s_i+....+s_k <= B + R*(t_k - t_i)
46 
47 	Algorithm.
48 	----------
49 
50 	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
51 
52 	N(t+delta) = min{B/R, N(t) + delta}
53 
54 	If the first packet in queue has length S, it may be
55 	transmitted only at the time t_* when S/R <= N(t_*),
56 	and in this case N(t) jumps:
57 
58 	N(t_* + 0) = N(t_* - 0) - S/R.
59 
60 
61 
62 	Actually, QoS requires two TBF to be applied to a data stream.
63 	One of them controls steady state burst size, another
64 	one with rate P (peak rate) and depth M (equal to link MTU)
65 	limits bursts at a smaller time scale.
66 
67 	It is easy to see that P>R, and B>M. If P is infinity, this double
68 	TBF is equivalent to a single one.
69 
70 	When TBF works in reshaping mode, latency is estimated as:
71 
72 	lat = max ((L-B)/R, (L-M)/P)
73 
74 
75 	NOTES.
76 	------
77 
78 	If TBF throttles, it starts a watchdog timer, which will wake it up
79 	when it is ready to transmit.
80 	Note that the minimal timer resolution is 1/HZ.
81 	If no new packets arrive during this period,
82 	or if the device is not awaken by EOI for some previous packet,
83 	TBF can stop its activity for 1/HZ.
84 
85 
86 	This means, that with depth B, the maximal rate is
87 
88 	R_crit = B*HZ
89 
90 	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
91 
92 	Note that the peak rate TBF is much more tough: with MTU 1500
93 	P_crit = 150Kbytes/sec. So, if you need greater peak
94 	rates, use alpha with HZ=1000 :-)
95 
96 	With classful TBF, limit is just kept for backwards compatibility.
97 	It is passed to the default bfifo qdisc - if the inner qdisc is
98 	changed the limit is not effective anymore.
99 */
100 
101 struct tbf_sched_data {
102 /* Parameters */
103 	u32		limit;		/* Maximal length of backlog: bytes */
104 	u32		max_size;
105 	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
106 	s64		mtu;
107 	struct psched_ratecfg rate;
108 	struct psched_ratecfg peak;
109 
110 /* Variables */
111 	s64	tokens;			/* Current number of B tokens */
112 	s64	ptokens;		/* Current number of P tokens */
113 	s64	t_c;			/* Time check-point */
114 	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
115 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
116 };
117 
118 
119 /* Time to Length, convert time in ns to length in bytes
120  * to determinate how many bytes can be sent in given time.
121  */
122 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
123 			 u64 time_in_ns)
124 {
125 	/* The formula is :
126 	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
127 	 */
128 	u64 len = time_in_ns * r->rate_bytes_ps;
129 
130 	do_div(len, NSEC_PER_SEC);
131 
132 	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
133 		do_div(len, 53);
134 		len = len * 48;
135 	}
136 
137 	if (len > r->overhead)
138 		len -= r->overhead;
139 	else
140 		len = 0;
141 
142 	return len;
143 }
144 
145 /* GSO packet is too big, segment it so that tbf can transmit
146  * each segment in time
147  */
148 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
149 		       struct sk_buff **to_free)
150 {
151 	struct tbf_sched_data *q = qdisc_priv(sch);
152 	struct sk_buff *segs, *nskb;
153 	netdev_features_t features = netif_skb_features(skb);
154 	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
155 	int ret, nb;
156 
157 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
158 
159 	if (IS_ERR_OR_NULL(segs))
160 		return qdisc_drop(skb, sch, to_free);
161 
162 	nb = 0;
163 	while (segs) {
164 		nskb = segs->next;
165 		segs->next = NULL;
166 		qdisc_skb_cb(segs)->pkt_len = segs->len;
167 		len += segs->len;
168 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
169 		if (ret != NET_XMIT_SUCCESS) {
170 			if (net_xmit_drop_count(ret))
171 				qdisc_qstats_drop(sch);
172 		} else {
173 			nb++;
174 		}
175 		segs = nskb;
176 	}
177 	sch->q.qlen += nb;
178 	if (nb > 1)
179 		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
180 	consume_skb(skb);
181 	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
182 }
183 
184 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
185 		       struct sk_buff **to_free)
186 {
187 	struct tbf_sched_data *q = qdisc_priv(sch);
188 	int ret;
189 
190 	if (qdisc_pkt_len(skb) > q->max_size) {
191 		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
192 			return tbf_segment(skb, sch, to_free);
193 		return qdisc_drop(skb, sch, to_free);
194 	}
195 	ret = qdisc_enqueue(skb, q->qdisc, to_free);
196 	if (ret != NET_XMIT_SUCCESS) {
197 		if (net_xmit_drop_count(ret))
198 			qdisc_qstats_drop(sch);
199 		return ret;
200 	}
201 
202 	qdisc_qstats_backlog_inc(sch, skb);
203 	sch->q.qlen++;
204 	return NET_XMIT_SUCCESS;
205 }
206 
207 static bool tbf_peak_present(const struct tbf_sched_data *q)
208 {
209 	return q->peak.rate_bytes_ps;
210 }
211 
212 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
213 {
214 	struct tbf_sched_data *q = qdisc_priv(sch);
215 	struct sk_buff *skb;
216 
217 	skb = q->qdisc->ops->peek(q->qdisc);
218 
219 	if (skb) {
220 		s64 now;
221 		s64 toks;
222 		s64 ptoks = 0;
223 		unsigned int len = qdisc_pkt_len(skb);
224 
225 		now = ktime_get_ns();
226 		toks = min_t(s64, now - q->t_c, q->buffer);
227 
228 		if (tbf_peak_present(q)) {
229 			ptoks = toks + q->ptokens;
230 			if (ptoks > q->mtu)
231 				ptoks = q->mtu;
232 			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
233 		}
234 		toks += q->tokens;
235 		if (toks > q->buffer)
236 			toks = q->buffer;
237 		toks -= (s64) psched_l2t_ns(&q->rate, len);
238 
239 		if ((toks|ptoks) >= 0) {
240 			skb = qdisc_dequeue_peeked(q->qdisc);
241 			if (unlikely(!skb))
242 				return NULL;
243 
244 			q->t_c = now;
245 			q->tokens = toks;
246 			q->ptokens = ptoks;
247 			qdisc_qstats_backlog_dec(sch, skb);
248 			sch->q.qlen--;
249 			qdisc_bstats_update(sch, skb);
250 			return skb;
251 		}
252 
253 		qdisc_watchdog_schedule_ns(&q->watchdog,
254 					   now + max_t(long, -toks, -ptoks));
255 
256 		/* Maybe we have a shorter packet in the queue,
257 		   which can be sent now. It sounds cool,
258 		   but, however, this is wrong in principle.
259 		   We MUST NOT reorder packets under these circumstances.
260 
261 		   Really, if we split the flow into independent
262 		   subflows, it would be a very good solution.
263 		   This is the main idea of all FQ algorithms
264 		   (cf. CSZ, HPFQ, HFSC)
265 		 */
266 
267 		qdisc_qstats_overlimit(sch);
268 	}
269 	return NULL;
270 }
271 
272 static void tbf_reset(struct Qdisc *sch)
273 {
274 	struct tbf_sched_data *q = qdisc_priv(sch);
275 
276 	qdisc_reset(q->qdisc);
277 	sch->qstats.backlog = 0;
278 	sch->q.qlen = 0;
279 	q->t_c = ktime_get_ns();
280 	q->tokens = q->buffer;
281 	q->ptokens = q->mtu;
282 	qdisc_watchdog_cancel(&q->watchdog);
283 }
284 
285 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
286 	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
287 	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
288 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
289 	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
290 	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
291 	[TCA_TBF_BURST] = { .type = NLA_U32 },
292 	[TCA_TBF_PBURST] = { .type = NLA_U32 },
293 };
294 
295 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
296 		      struct netlink_ext_ack *extack)
297 {
298 	int err;
299 	struct tbf_sched_data *q = qdisc_priv(sch);
300 	struct nlattr *tb[TCA_TBF_MAX + 1];
301 	struct tc_tbf_qopt *qopt;
302 	struct Qdisc *child = NULL;
303 	struct psched_ratecfg rate;
304 	struct psched_ratecfg peak;
305 	u64 max_size;
306 	s64 buffer, mtu;
307 	u64 rate64 = 0, prate64 = 0;
308 
309 	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
310 	if (err < 0)
311 		return err;
312 
313 	err = -EINVAL;
314 	if (tb[TCA_TBF_PARMS] == NULL)
315 		goto done;
316 
317 	qopt = nla_data(tb[TCA_TBF_PARMS]);
318 	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
319 		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
320 					      tb[TCA_TBF_RTAB],
321 					      NULL));
322 
323 	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
324 			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
325 						      tb[TCA_TBF_PTAB],
326 						      NULL));
327 
328 	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
329 	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
330 
331 	if (tb[TCA_TBF_RATE64])
332 		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
333 	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
334 
335 	if (tb[TCA_TBF_BURST]) {
336 		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
337 		buffer = psched_l2t_ns(&rate, max_size);
338 	} else {
339 		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
340 	}
341 
342 	if (qopt->peakrate.rate) {
343 		if (tb[TCA_TBF_PRATE64])
344 			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
345 		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
346 		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
347 			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
348 					peak.rate_bytes_ps, rate.rate_bytes_ps);
349 			err = -EINVAL;
350 			goto done;
351 		}
352 
353 		if (tb[TCA_TBF_PBURST]) {
354 			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
355 			max_size = min_t(u32, max_size, pburst);
356 			mtu = psched_l2t_ns(&peak, pburst);
357 		} else {
358 			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
359 		}
360 	} else {
361 		memset(&peak, 0, sizeof(peak));
362 	}
363 
364 	if (max_size < psched_mtu(qdisc_dev(sch)))
365 		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
366 				    max_size, qdisc_dev(sch)->name,
367 				    psched_mtu(qdisc_dev(sch)));
368 
369 	if (!max_size) {
370 		err = -EINVAL;
371 		goto done;
372 	}
373 
374 	if (q->qdisc != &noop_qdisc) {
375 		err = fifo_set_limit(q->qdisc, qopt->limit);
376 		if (err)
377 			goto done;
378 	} else if (qopt->limit > 0) {
379 		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
380 					 extack);
381 		if (IS_ERR(child)) {
382 			err = PTR_ERR(child);
383 			goto done;
384 		}
385 	}
386 
387 	sch_tree_lock(sch);
388 	if (child) {
389 		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
390 					  q->qdisc->qstats.backlog);
391 		qdisc_destroy(q->qdisc);
392 		q->qdisc = child;
393 		if (child != &noop_qdisc)
394 			qdisc_hash_add(child, true);
395 	}
396 	q->limit = qopt->limit;
397 	if (tb[TCA_TBF_PBURST])
398 		q->mtu = mtu;
399 	else
400 		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
401 	q->max_size = max_size;
402 	if (tb[TCA_TBF_BURST])
403 		q->buffer = buffer;
404 	else
405 		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
406 	q->tokens = q->buffer;
407 	q->ptokens = q->mtu;
408 
409 	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
410 	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
411 
412 	sch_tree_unlock(sch);
413 	err = 0;
414 done:
415 	return err;
416 }
417 
418 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
419 		    struct netlink_ext_ack *extack)
420 {
421 	struct tbf_sched_data *q = qdisc_priv(sch);
422 
423 	qdisc_watchdog_init(&q->watchdog, sch);
424 	q->qdisc = &noop_qdisc;
425 
426 	if (!opt)
427 		return -EINVAL;
428 
429 	q->t_c = ktime_get_ns();
430 
431 	return tbf_change(sch, opt, extack);
432 }
433 
434 static void tbf_destroy(struct Qdisc *sch)
435 {
436 	struct tbf_sched_data *q = qdisc_priv(sch);
437 
438 	qdisc_watchdog_cancel(&q->watchdog);
439 	qdisc_destroy(q->qdisc);
440 }
441 
442 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
443 {
444 	struct tbf_sched_data *q = qdisc_priv(sch);
445 	struct nlattr *nest;
446 	struct tc_tbf_qopt opt;
447 
448 	sch->qstats.backlog = q->qdisc->qstats.backlog;
449 	nest = nla_nest_start(skb, TCA_OPTIONS);
450 	if (nest == NULL)
451 		goto nla_put_failure;
452 
453 	opt.limit = q->limit;
454 	psched_ratecfg_getrate(&opt.rate, &q->rate);
455 	if (tbf_peak_present(q))
456 		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
457 	else
458 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
459 	opt.mtu = PSCHED_NS2TICKS(q->mtu);
460 	opt.buffer = PSCHED_NS2TICKS(q->buffer);
461 	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
462 		goto nla_put_failure;
463 	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
464 	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
465 			      TCA_TBF_PAD))
466 		goto nla_put_failure;
467 	if (tbf_peak_present(q) &&
468 	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
469 	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
470 			      TCA_TBF_PAD))
471 		goto nla_put_failure;
472 
473 	return nla_nest_end(skb, nest);
474 
475 nla_put_failure:
476 	nla_nest_cancel(skb, nest);
477 	return -1;
478 }
479 
480 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
481 			  struct sk_buff *skb, struct tcmsg *tcm)
482 {
483 	struct tbf_sched_data *q = qdisc_priv(sch);
484 
485 	tcm->tcm_handle |= TC_H_MIN(1);
486 	tcm->tcm_info = q->qdisc->handle;
487 
488 	return 0;
489 }
490 
491 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
492 		     struct Qdisc **old, struct netlink_ext_ack *extack)
493 {
494 	struct tbf_sched_data *q = qdisc_priv(sch);
495 
496 	if (new == NULL)
497 		new = &noop_qdisc;
498 
499 	*old = qdisc_replace(sch, new, &q->qdisc);
500 	return 0;
501 }
502 
503 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
504 {
505 	struct tbf_sched_data *q = qdisc_priv(sch);
506 	return q->qdisc;
507 }
508 
509 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
510 {
511 	return 1;
512 }
513 
514 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
515 {
516 	if (!walker->stop) {
517 		if (walker->count >= walker->skip)
518 			if (walker->fn(sch, 1, walker) < 0) {
519 				walker->stop = 1;
520 				return;
521 			}
522 		walker->count++;
523 	}
524 }
525 
526 static const struct Qdisc_class_ops tbf_class_ops = {
527 	.graft		=	tbf_graft,
528 	.leaf		=	tbf_leaf,
529 	.find		=	tbf_find,
530 	.walk		=	tbf_walk,
531 	.dump		=	tbf_dump_class,
532 };
533 
534 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
535 	.next		=	NULL,
536 	.cl_ops		=	&tbf_class_ops,
537 	.id		=	"tbf",
538 	.priv_size	=	sizeof(struct tbf_sched_data),
539 	.enqueue	=	tbf_enqueue,
540 	.dequeue	=	tbf_dequeue,
541 	.peek		=	qdisc_peek_dequeued,
542 	.init		=	tbf_init,
543 	.reset		=	tbf_reset,
544 	.destroy	=	tbf_destroy,
545 	.change		=	tbf_change,
546 	.dump		=	tbf_dump,
547 	.owner		=	THIS_MODULE,
548 };
549 
550 static int __init tbf_module_init(void)
551 {
552 	return register_qdisc(&tbf_qdisc_ops);
553 }
554 
555 static void __exit tbf_module_exit(void)
556 {
557 	unregister_qdisc(&tbf_qdisc_ops);
558 }
559 module_init(tbf_module_init)
560 module_exit(tbf_module_exit)
561 MODULE_LICENSE("GPL");
562