xref: /openbmc/linux/net/sched/sch_tbf.c (revision f7af616c632ee2ac3af0876fe33bf9e0232e665a)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_tbf.c	Token Bucket Filter queue.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
7  *						 original idea by Martin Devera
8  */
9 
10 #include <linux/module.h>
11 #include <linux/types.h>
12 #include <linux/kernel.h>
13 #include <linux/string.h>
14 #include <linux/errno.h>
15 #include <linux/skbuff.h>
16 #include <net/netlink.h>
17 #include <net/sch_generic.h>
18 #include <net/pkt_cls.h>
19 #include <net/pkt_sched.h>
20 
21 
22 /*	Simple Token Bucket Filter.
23 	=======================================
24 
25 	SOURCE.
26 	-------
27 
28 	None.
29 
30 	Description.
31 	------------
32 
33 	A data flow obeys TBF with rate R and depth B, if for any
34 	time interval t_i...t_f the number of transmitted bits
35 	does not exceed B + R*(t_f-t_i).
36 
37 	Packetized version of this definition:
38 	The sequence of packets of sizes s_i served at moments t_i
39 	obeys TBF, if for any i<=k:
40 
41 	s_i+....+s_k <= B + R*(t_k - t_i)
42 
43 	Algorithm.
44 	----------
45 
46 	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
47 
48 	N(t+delta) = min{B/R, N(t) + delta}
49 
50 	If the first packet in queue has length S, it may be
51 	transmitted only at the time t_* when S/R <= N(t_*),
52 	and in this case N(t) jumps:
53 
54 	N(t_* + 0) = N(t_* - 0) - S/R.
55 
56 
57 
58 	Actually, QoS requires two TBF to be applied to a data stream.
59 	One of them controls steady state burst size, another
60 	one with rate P (peak rate) and depth M (equal to link MTU)
61 	limits bursts at a smaller time scale.
62 
63 	It is easy to see that P>R, and B>M. If P is infinity, this double
64 	TBF is equivalent to a single one.
65 
66 	When TBF works in reshaping mode, latency is estimated as:
67 
68 	lat = max ((L-B)/R, (L-M)/P)
69 
70 
71 	NOTES.
72 	------
73 
74 	If TBF throttles, it starts a watchdog timer, which will wake it up
75 	when it is ready to transmit.
76 	Note that the minimal timer resolution is 1/HZ.
77 	If no new packets arrive during this period,
78 	or if the device is not awaken by EOI for some previous packet,
79 	TBF can stop its activity for 1/HZ.
80 
81 
82 	This means, that with depth B, the maximal rate is
83 
84 	R_crit = B*HZ
85 
86 	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
87 
88 	Note that the peak rate TBF is much more tough: with MTU 1500
89 	P_crit = 150Kbytes/sec. So, if you need greater peak
90 	rates, use alpha with HZ=1000 :-)
91 
92 	With classful TBF, limit is just kept for backwards compatibility.
93 	It is passed to the default bfifo qdisc - if the inner qdisc is
94 	changed the limit is not effective anymore.
95 */
96 
97 struct tbf_sched_data {
98 /* Parameters */
99 	u32		limit;		/* Maximal length of backlog: bytes */
100 	u32		max_size;
101 	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
102 	s64		mtu;
103 	struct psched_ratecfg rate;
104 	struct psched_ratecfg peak;
105 
106 /* Variables */
107 	s64	tokens;			/* Current number of B tokens */
108 	s64	ptokens;		/* Current number of P tokens */
109 	s64	t_c;			/* Time check-point */
110 	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
111 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
112 };
113 
114 
115 /* Time to Length, convert time in ns to length in bytes
116  * to determinate how many bytes can be sent in given time.
117  */
118 static u64 psched_ns_t2l(const struct psched_ratecfg *r,
119 			 u64 time_in_ns)
120 {
121 	/* The formula is :
122 	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
123 	 */
124 	u64 len = time_in_ns * r->rate_bytes_ps;
125 
126 	do_div(len, NSEC_PER_SEC);
127 
128 	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
129 		do_div(len, 53);
130 		len = len * 48;
131 	}
132 
133 	if (len > r->overhead)
134 		len -= r->overhead;
135 	else
136 		len = 0;
137 
138 	return len;
139 }
140 
141 static void tbf_offload_change(struct Qdisc *sch)
142 {
143 	struct tbf_sched_data *q = qdisc_priv(sch);
144 	struct net_device *dev = qdisc_dev(sch);
145 	struct tc_tbf_qopt_offload qopt;
146 
147 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
148 		return;
149 
150 	qopt.command = TC_TBF_REPLACE;
151 	qopt.handle = sch->handle;
152 	qopt.parent = sch->parent;
153 	qopt.replace_params.rate = q->rate;
154 	qopt.replace_params.max_size = q->max_size;
155 	qopt.replace_params.qstats = &sch->qstats;
156 
157 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
158 }
159 
160 static void tbf_offload_destroy(struct Qdisc *sch)
161 {
162 	struct net_device *dev = qdisc_dev(sch);
163 	struct tc_tbf_qopt_offload qopt;
164 
165 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
166 		return;
167 
168 	qopt.command = TC_TBF_DESTROY;
169 	qopt.handle = sch->handle;
170 	qopt.parent = sch->parent;
171 	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
172 }
173 
174 static int tbf_offload_dump(struct Qdisc *sch)
175 {
176 	struct tc_tbf_qopt_offload qopt;
177 
178 	qopt.command = TC_TBF_STATS;
179 	qopt.handle = sch->handle;
180 	qopt.parent = sch->parent;
181 	qopt.stats.bstats = &sch->bstats;
182 	qopt.stats.qstats = &sch->qstats;
183 
184 	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
185 }
186 
187 /* GSO packet is too big, segment it so that tbf can transmit
188  * each segment in time
189  */
190 static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
191 		       struct sk_buff **to_free)
192 {
193 	struct tbf_sched_data *q = qdisc_priv(sch);
194 	struct sk_buff *segs, *nskb;
195 	netdev_features_t features = netif_skb_features(skb);
196 	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
197 	int ret, nb;
198 
199 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
200 
201 	if (IS_ERR_OR_NULL(segs))
202 		return qdisc_drop(skb, sch, to_free);
203 
204 	nb = 0;
205 	skb_list_walk_safe(segs, segs, nskb) {
206 		skb_mark_not_on_list(segs);
207 		qdisc_skb_cb(segs)->pkt_len = segs->len;
208 		len += segs->len;
209 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
210 		if (ret != NET_XMIT_SUCCESS) {
211 			if (net_xmit_drop_count(ret))
212 				qdisc_qstats_drop(sch);
213 		} else {
214 			nb++;
215 		}
216 	}
217 	sch->q.qlen += nb;
218 	if (nb > 1)
219 		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
220 	consume_skb(skb);
221 	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
222 }
223 
224 static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
225 		       struct sk_buff **to_free)
226 {
227 	struct tbf_sched_data *q = qdisc_priv(sch);
228 	unsigned int len = qdisc_pkt_len(skb);
229 	int ret;
230 
231 	if (qdisc_pkt_len(skb) > q->max_size) {
232 		if (skb_is_gso(skb) &&
233 		    skb_gso_validate_mac_len(skb, q->max_size))
234 			return tbf_segment(skb, sch, to_free);
235 		return qdisc_drop(skb, sch, to_free);
236 	}
237 	ret = qdisc_enqueue(skb, q->qdisc, to_free);
238 	if (ret != NET_XMIT_SUCCESS) {
239 		if (net_xmit_drop_count(ret))
240 			qdisc_qstats_drop(sch);
241 		return ret;
242 	}
243 
244 	sch->qstats.backlog += len;
245 	sch->q.qlen++;
246 	return NET_XMIT_SUCCESS;
247 }
248 
249 static bool tbf_peak_present(const struct tbf_sched_data *q)
250 {
251 	return q->peak.rate_bytes_ps;
252 }
253 
254 static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
255 {
256 	struct tbf_sched_data *q = qdisc_priv(sch);
257 	struct sk_buff *skb;
258 
259 	skb = q->qdisc->ops->peek(q->qdisc);
260 
261 	if (skb) {
262 		s64 now;
263 		s64 toks;
264 		s64 ptoks = 0;
265 		unsigned int len = qdisc_pkt_len(skb);
266 
267 		now = ktime_get_ns();
268 		toks = min_t(s64, now - q->t_c, q->buffer);
269 
270 		if (tbf_peak_present(q)) {
271 			ptoks = toks + q->ptokens;
272 			if (ptoks > q->mtu)
273 				ptoks = q->mtu;
274 			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
275 		}
276 		toks += q->tokens;
277 		if (toks > q->buffer)
278 			toks = q->buffer;
279 		toks -= (s64) psched_l2t_ns(&q->rate, len);
280 
281 		if ((toks|ptoks) >= 0) {
282 			skb = qdisc_dequeue_peeked(q->qdisc);
283 			if (unlikely(!skb))
284 				return NULL;
285 
286 			q->t_c = now;
287 			q->tokens = toks;
288 			q->ptokens = ptoks;
289 			qdisc_qstats_backlog_dec(sch, skb);
290 			sch->q.qlen--;
291 			qdisc_bstats_update(sch, skb);
292 			return skb;
293 		}
294 
295 		qdisc_watchdog_schedule_ns(&q->watchdog,
296 					   now + max_t(long, -toks, -ptoks));
297 
298 		/* Maybe we have a shorter packet in the queue,
299 		   which can be sent now. It sounds cool,
300 		   but, however, this is wrong in principle.
301 		   We MUST NOT reorder packets under these circumstances.
302 
303 		   Really, if we split the flow into independent
304 		   subflows, it would be a very good solution.
305 		   This is the main idea of all FQ algorithms
306 		   (cf. CSZ, HPFQ, HFSC)
307 		 */
308 
309 		qdisc_qstats_overlimit(sch);
310 	}
311 	return NULL;
312 }
313 
314 static void tbf_reset(struct Qdisc *sch)
315 {
316 	struct tbf_sched_data *q = qdisc_priv(sch);
317 
318 	qdisc_reset(q->qdisc);
319 	sch->qstats.backlog = 0;
320 	sch->q.qlen = 0;
321 	q->t_c = ktime_get_ns();
322 	q->tokens = q->buffer;
323 	q->ptokens = q->mtu;
324 	qdisc_watchdog_cancel(&q->watchdog);
325 }
326 
327 static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
328 	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
329 	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
330 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
331 	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
332 	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
333 	[TCA_TBF_BURST] = { .type = NLA_U32 },
334 	[TCA_TBF_PBURST] = { .type = NLA_U32 },
335 };
336 
337 static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
338 		      struct netlink_ext_ack *extack)
339 {
340 	int err;
341 	struct tbf_sched_data *q = qdisc_priv(sch);
342 	struct nlattr *tb[TCA_TBF_MAX + 1];
343 	struct tc_tbf_qopt *qopt;
344 	struct Qdisc *child = NULL;
345 	struct psched_ratecfg rate;
346 	struct psched_ratecfg peak;
347 	u64 max_size;
348 	s64 buffer, mtu;
349 	u64 rate64 = 0, prate64 = 0;
350 
351 	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
352 					  NULL);
353 	if (err < 0)
354 		return err;
355 
356 	err = -EINVAL;
357 	if (tb[TCA_TBF_PARMS] == NULL)
358 		goto done;
359 
360 	qopt = nla_data(tb[TCA_TBF_PARMS]);
361 	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
362 		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
363 					      tb[TCA_TBF_RTAB],
364 					      NULL));
365 
366 	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
367 			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
368 						      tb[TCA_TBF_PTAB],
369 						      NULL));
370 
371 	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
372 	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
373 
374 	if (tb[TCA_TBF_RATE64])
375 		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
376 	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
377 
378 	if (tb[TCA_TBF_BURST]) {
379 		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
380 		buffer = psched_l2t_ns(&rate, max_size);
381 	} else {
382 		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
383 	}
384 
385 	if (qopt->peakrate.rate) {
386 		if (tb[TCA_TBF_PRATE64])
387 			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
388 		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
389 		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
390 			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
391 					peak.rate_bytes_ps, rate.rate_bytes_ps);
392 			err = -EINVAL;
393 			goto done;
394 		}
395 
396 		if (tb[TCA_TBF_PBURST]) {
397 			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
398 			max_size = min_t(u32, max_size, pburst);
399 			mtu = psched_l2t_ns(&peak, pburst);
400 		} else {
401 			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
402 		}
403 	} else {
404 		memset(&peak, 0, sizeof(peak));
405 	}
406 
407 	if (max_size < psched_mtu(qdisc_dev(sch)))
408 		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
409 				    max_size, qdisc_dev(sch)->name,
410 				    psched_mtu(qdisc_dev(sch)));
411 
412 	if (!max_size) {
413 		err = -EINVAL;
414 		goto done;
415 	}
416 
417 	if (q->qdisc != &noop_qdisc) {
418 		err = fifo_set_limit(q->qdisc, qopt->limit);
419 		if (err)
420 			goto done;
421 	} else if (qopt->limit > 0) {
422 		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
423 					 extack);
424 		if (IS_ERR(child)) {
425 			err = PTR_ERR(child);
426 			goto done;
427 		}
428 
429 		/* child is fifo, no need to check for noop_qdisc */
430 		qdisc_hash_add(child, true);
431 	}
432 
433 	sch_tree_lock(sch);
434 	if (child) {
435 		qdisc_tree_flush_backlog(q->qdisc);
436 		qdisc_put(q->qdisc);
437 		q->qdisc = child;
438 	}
439 	q->limit = qopt->limit;
440 	if (tb[TCA_TBF_PBURST])
441 		q->mtu = mtu;
442 	else
443 		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
444 	q->max_size = max_size;
445 	if (tb[TCA_TBF_BURST])
446 		q->buffer = buffer;
447 	else
448 		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
449 	q->tokens = q->buffer;
450 	q->ptokens = q->mtu;
451 
452 	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
453 	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
454 
455 	sch_tree_unlock(sch);
456 	err = 0;
457 
458 	tbf_offload_change(sch);
459 done:
460 	return err;
461 }
462 
463 static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
464 		    struct netlink_ext_ack *extack)
465 {
466 	struct tbf_sched_data *q = qdisc_priv(sch);
467 
468 	qdisc_watchdog_init(&q->watchdog, sch);
469 	q->qdisc = &noop_qdisc;
470 
471 	if (!opt)
472 		return -EINVAL;
473 
474 	q->t_c = ktime_get_ns();
475 
476 	return tbf_change(sch, opt, extack);
477 }
478 
479 static void tbf_destroy(struct Qdisc *sch)
480 {
481 	struct tbf_sched_data *q = qdisc_priv(sch);
482 
483 	qdisc_watchdog_cancel(&q->watchdog);
484 	tbf_offload_destroy(sch);
485 	qdisc_put(q->qdisc);
486 }
487 
488 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
489 {
490 	struct tbf_sched_data *q = qdisc_priv(sch);
491 	struct nlattr *nest;
492 	struct tc_tbf_qopt opt;
493 	int err;
494 
495 	err = tbf_offload_dump(sch);
496 	if (err)
497 		return err;
498 
499 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
500 	if (nest == NULL)
501 		goto nla_put_failure;
502 
503 	opt.limit = q->limit;
504 	psched_ratecfg_getrate(&opt.rate, &q->rate);
505 	if (tbf_peak_present(q))
506 		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
507 	else
508 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
509 	opt.mtu = PSCHED_NS2TICKS(q->mtu);
510 	opt.buffer = PSCHED_NS2TICKS(q->buffer);
511 	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
512 		goto nla_put_failure;
513 	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
514 	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
515 			      TCA_TBF_PAD))
516 		goto nla_put_failure;
517 	if (tbf_peak_present(q) &&
518 	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
519 	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
520 			      TCA_TBF_PAD))
521 		goto nla_put_failure;
522 
523 	return nla_nest_end(skb, nest);
524 
525 nla_put_failure:
526 	nla_nest_cancel(skb, nest);
527 	return -1;
528 }
529 
530 static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
531 			  struct sk_buff *skb, struct tcmsg *tcm)
532 {
533 	struct tbf_sched_data *q = qdisc_priv(sch);
534 
535 	tcm->tcm_handle |= TC_H_MIN(1);
536 	tcm->tcm_info = q->qdisc->handle;
537 
538 	return 0;
539 }
540 
541 static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
542 		     struct Qdisc **old, struct netlink_ext_ack *extack)
543 {
544 	struct tbf_sched_data *q = qdisc_priv(sch);
545 
546 	if (new == NULL)
547 		new = &noop_qdisc;
548 
549 	*old = qdisc_replace(sch, new, &q->qdisc);
550 	return 0;
551 }
552 
553 static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
554 {
555 	struct tbf_sched_data *q = qdisc_priv(sch);
556 	return q->qdisc;
557 }
558 
559 static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
560 {
561 	return 1;
562 }
563 
564 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
565 {
566 	if (!walker->stop) {
567 		if (walker->count >= walker->skip)
568 			if (walker->fn(sch, 1, walker) < 0) {
569 				walker->stop = 1;
570 				return;
571 			}
572 		walker->count++;
573 	}
574 }
575 
576 static const struct Qdisc_class_ops tbf_class_ops = {
577 	.graft		=	tbf_graft,
578 	.leaf		=	tbf_leaf,
579 	.find		=	tbf_find,
580 	.walk		=	tbf_walk,
581 	.dump		=	tbf_dump_class,
582 };
583 
584 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
585 	.next		=	NULL,
586 	.cl_ops		=	&tbf_class_ops,
587 	.id		=	"tbf",
588 	.priv_size	=	sizeof(struct tbf_sched_data),
589 	.enqueue	=	tbf_enqueue,
590 	.dequeue	=	tbf_dequeue,
591 	.peek		=	qdisc_peek_dequeued,
592 	.init		=	tbf_init,
593 	.reset		=	tbf_reset,
594 	.destroy	=	tbf_destroy,
595 	.change		=	tbf_change,
596 	.dump		=	tbf_dump,
597 	.owner		=	THIS_MODULE,
598 };
599 
600 static int __init tbf_module_init(void)
601 {
602 	return register_qdisc(&tbf_qdisc_ops);
603 }
604 
605 static void __exit tbf_module_exit(void)
606 {
607 	unregister_qdisc(&tbf_qdisc_ops);
608 }
609 module_init(tbf_module_init)
610 module_exit(tbf_module_exit)
611 MODULE_LICENSE("GPL");
612