// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/gso.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	In practice, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	When TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stall for up to 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much stricter: with MTU 1500,
	P_crit = 150Kbytes/sec. So, if you need higher peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, the limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
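
/* Implementation note (a summary of the code below, not part of the original
 * description): tokens are not counted in bytes but in nanoseconds of
 * transmission time at the configured rate.  q->buffer and q->mtu hold the
 * two bucket depths already converted to ns, q->tokens and q->ptokens hold
 * the currently available credit, and q->t_c is the time of the last update.
 * On dequeue, the time elapsed since q->t_c is added to the credit (capped
 * at the bucket depth) and the cost of the head packet, obtained via
 * psched_l2t_ns(), is subtracted.  For example, at a rate of 1 Mbyte/s a
 * 1500 byte packet costs 1500000 ns (1.5 ms) of credit.
 */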

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length: convert a time in ns to a length in bytes,
 * i.e. determine how many bytes can be sent in the given time.
 */
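/* Illustrative example (arbitrary numbers): with rate_bytes_ps = 125000
 * (1 Mbit/s), time_in_ns = 12000000 (12 ms) yields 1500 bytes before the
 * adjustments below.  For TC_LINKLAYER_ATM the result is scaled by 48/53,
 * since every 53 byte ATM cell carries only 48 bytes of payload, and any
 * configured per-packet overhead is then subtracted.
 */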
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is :
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}

static void tbf_offload_change(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_REPLACE;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.replace_params.rate = q->rate;
	qopt.replace_params.max_size = q->max_size;
	qopt.replace_params.qstats = &sch->qstats;

	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct tc_tbf_qopt_offload qopt;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	qopt.command = TC_TBF_DESTROY;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
}

static int tbf_offload_dump(struct Qdisc *sch)
{
	struct tc_tbf_qopt_offload qopt;

	qopt.command = TC_TBF_STATS;
	qopt.handle = sch->handle;
	qopt.parent = sch->parent;
	qopt.stats.bstats = &sch->bstats;
	qopt.stats.qstats = &sch->qstats;

	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
}

static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new,
			      struct Qdisc *old, struct netlink_ext_ack *extack)
{
	struct tc_tbf_qopt_offload graft_offload = {
		.handle		= sch->handle,
		.parent		= sch->parent,
		.child_handle	= new->handle,
		.command	= TC_TBF_GRAFT,
	};

	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old,
				   TC_SETUP_QDISC_TBF, &graft_offload, extack);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	skb_list_walk_safe(segs, segs, nskb) {
		skb_mark_not_on_list(segs);
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
	}
	sch->q.qlen += nb;
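	/* The ancestors will account for a single packet of prev_len bytes
	 * once this enqueue returns, but nb segments totalling len bytes
	 * were actually queued; push the difference up the tree.
	 */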
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

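	/* A packet larger than max_size can never accumulate enough tokens
	 * to be sent in one piece.  A GSO skb whose individual segments do
	 * fit is software-segmented instead of being dropped outright.
	 */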
	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

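		/* toks/ptoks below are the available credit in ns: the time
		 * elapsed since the last checkpoint plus the leftover tokens,
		 * capped at the bucket depth, minus the cost of this packet
		 * at the configured rate (peak rate for ptoks).
		 */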
		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

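		/* Not enough tokens: program the watchdog to fire once the
		 * larger of the two deficits (-toks or -ptoks, both in ns)
		 * has been replenished.
		 */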
		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which could be sent now. It sounds tempting,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct Qdisc *old = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
			qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
						      tb[TCA_TBF_PTAB],
						      NULL));

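	/* qopt->buffer and qopt->mtu arrive in psched ticks; convert them to
	 * nanoseconds, clamping the result so that it fits in 32 bits.
	 */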
	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

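	/* The optional peak-rate bucket only makes sense if the peak rate is
	 * strictly greater than the base rate; its depth (mtu, or a value
	 * derived from TCA_TBF_PBURST) bounds how much can be sent
	 * back-to-back at the peak rate.
	 */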
	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu !\n",
					peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_flush_backlog(q->qdisc);
		old = q->qdisc;
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	qdisc_put(old);
	err = 0;

	tbf_offload_change(sch);
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	tbf_offload_destroy(sch);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;
	int err;

	err = tbf_offload_dump(sch);
	if (err)
		return err;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);

	tbf_offload_graft(sch, new, *old, extack);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		tc_qdisc_stats_dump(sch, 1, walker);
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");