xref: /openbmc/linux/net/sched/sch_htb.c (revision 3543f883)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows assigning a priority to each class in the hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned a level. A leaf ALWAYS has level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
51     one less than their parent.
52 */
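
/* Editorial illustration (not part of the original source): a classic HTB
 * setup as configured from userspace with iproute2; exact syntax may vary by
 * iproute2 version, and the device name "eth0" is only an example:
 *
 *	tc qdisc add dev eth0 root handle 1: htb default 20
 *	tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit ceil 100mbit
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 60mbit ceil 100mbit
 *	tc class add dev eth0 parent 1:1 classid 1:20 htb rate 40mbit ceil 100mbit
 *
 * 1:10 and 1:20 are leaves (level 0); unclassified traffic falls back to the
 * "default" class 1:20, and leaves may borrow from 1:1 up to their ceil.
 */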
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
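
/* Editorial note (illustration, not from the original source): both knobs
 * above are plain module parameters, so they can be set at module load time
 * or, given the 0640 mode, flipped at runtime through sysfs, e.g.:
 *
 *	modprobe sch_htb htb_hysteresis=1
 *	echo 1 > /sys/module/sch_htb/parameters/htb_hysteresis
 */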
68 
69 /* used internally to keep the status of a single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When a class changes from state 1->2 and disconnects from
83 	 * its parent's feed, we lose the ptr value and start from the
84 	 * first child again. Here we store the classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 	int			filter_cnt;
106 
107 	int			level;		/* our level (see above) */
108 	unsigned int		children;
109 	struct htb_class	*parent;	/* parent class */
110 
111 	struct net_rate_estimator __rcu *rate_est;
112 
113 	/*
114 	 * Written often fields
115 	 */
116 	struct gnet_stats_basic_sync bstats;
117 	struct gnet_stats_basic_sync bstats_bias;
118 	struct tc_htb_xstats	xstats;	/* our special stats */
119 
120 	/* token bucket parameters */
121 	s64			tokens, ctokens;/* current number of tokens */
122 	s64			t_c;		/* checkpoint time */
123 
124 	union {
125 		struct htb_class_leaf {
126 			int		deficit[TC_HTB_MAXDEPTH];
127 			struct Qdisc	*q;
128 			struct netdev_queue *offload_queue;
129 		} leaf;
130 		struct htb_class_inner {
131 			struct htb_prio clprio[TC_HTB_NUMPRIO];
132 		} inner;
133 	};
134 	s64			pq_key;
135 
136 	int			prio_activity;	/* for which prios are we active */
137 	enum htb_cmode		cmode;		/* current mode of the class */
138 	struct rb_node		pq_node;	/* node for event queue */
139 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
140 
141 	unsigned int drops ____cacheline_aligned_in_smp;
142 	unsigned int		overlimits;
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto __rcu	*filter_list;
157 	struct tcf_block	*block;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non shaped skbs; let them go directly thru */
165 	struct qdisc_skb_head	direct_queue;
166 	u32			direct_pkts;
167 	u32			overlimits;
168 
169 	struct qdisc_watchdog	watchdog;
170 
171 	s64			now;	/* cached dequeue time */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 
180 	struct Qdisc		**direct_qdiscs;
181 	unsigned int            num_direct_qdiscs;
182 
183 	bool			offload;
184 };
185 
186 /* find class in global hash table using given handle */
187 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 {
189 	struct htb_sched *q = qdisc_priv(sch);
190 	struct Qdisc_class_common *clc;
191 
192 	clc = qdisc_class_find(&q->clhash, handle);
193 	if (clc == NULL)
194 		return NULL;
195 	return container_of(clc, struct htb_class, common);
196 }
197 
198 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 {
200 	return (unsigned long)htb_find(handle, sch);
201 }
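
/* Editorial illustration (not in the original source): class handles pack the
 * qdisc major into the upper 16 bits and the class minor into the lower 16
 * bits, so the tc classid "1:10" (tc parses the minor as hex) would be looked
 * up roughly as:
 *
 *	struct htb_class *cl = htb_find(TC_H_MAKE(0x10000, 0x10), sch);
 */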
202 
203 #define HTB_DIRECT ((struct htb_class *)-1L)
204 
205 /**
206  * htb_classify - classify a packet into class
207  * @skb: the socket buffer
208  * @sch: the active queue discipline
209  * @qerr: pointer for returned status code
210  *
211  * It returns NULL if the packet should be dropped or -1 if the packet
212  * should be passed directly thru. In all other cases a leaf class is returned.
213  * We allow direct class selection by classid in skb->priority. Then we examine
214  * filters in the qdisc and in inner nodes (if a higher filter points to an inner
215  * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
216  * internal fifo (direct). These packets then go directly thru. If we still
217  * have no valid leaf we try to use the MAJOR:default leaf. If that is still
218  * unsuccessful, we finish and return the direct queue.
219  */
220 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
221 				      int *qerr)
222 {
223 	struct htb_sched *q = qdisc_priv(sch);
224 	struct htb_class *cl;
225 	struct tcf_result res;
226 	struct tcf_proto *tcf;
227 	int result;
228 
229 	/* allow selecting the class by setting skb->priority to a valid classid;
230 	 * note that the nfmark can be used too by attaching an fw filter with no
231 	 * rules in it
232 	 */
233 	if (skb->priority == sch->handle)
234 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
235 	cl = htb_find(skb->priority, sch);
236 	if (cl) {
237 		if (cl->level == 0)
238 			return cl;
239 		/* Start with inner filter chain if a non-leaf class is selected */
240 		tcf = rcu_dereference_bh(cl->filter_list);
241 	} else {
242 		tcf = rcu_dereference_bh(q->filter_list);
243 	}
244 
245 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
246 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
247 #ifdef CONFIG_NET_CLS_ACT
248 		switch (result) {
249 		case TC_ACT_QUEUED:
250 		case TC_ACT_STOLEN:
251 		case TC_ACT_TRAP:
252 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 			fallthrough;
254 		case TC_ACT_SHOT:
255 			return NULL;
256 		}
257 #endif
258 		cl = (void *)res.class;
259 		if (!cl) {
260 			if (res.classid == sch->handle)
261 				return HTB_DIRECT;	/* X:0 (direct flow) */
262 			cl = htb_find(res.classid, sch);
263 			if (!cl)
264 				break;	/* filter selected invalid classid */
265 		}
266 		if (!cl->level)
267 			return cl;	/* we hit leaf; return it */
268 
269 		/* we have got inner class; apply inner filter chain */
270 		tcf = rcu_dereference_bh(cl->filter_list);
271 	}
272 	/* classification failed; try to use default class */
273 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
274 	if (!cl || cl->level)
275 		return HTB_DIRECT;	/* bad default .. this is safe bet */
276 	return cl;
277 }
278 
279 /**
280  * htb_add_to_id_tree - adds class to the round robin list
281  * @root: the root of the tree
282  * @cl: the class to add
283  * @prio: the given prio in class
284  *
285  * The routine adds the class to the list (actually a tree) sorted by classid.
286  * Make sure that the class is not already on such a list for the given prio.
287  */
288 static void htb_add_to_id_tree(struct rb_root *root,
289 			       struct htb_class *cl, int prio)
290 {
291 	struct rb_node **p = &root->rb_node, *parent = NULL;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, node[prio]);
297 
298 		if (cl->common.classid > c->common.classid)
299 			p = &parent->rb_right;
300 		else
301 			p = &parent->rb_left;
302 	}
303 	rb_link_node(&cl->node[prio], parent, p);
304 	rb_insert_color(&cl->node[prio], root);
305 }
306 
307 /**
308  * htb_add_to_wait_tree - adds class to the event queue with delay
309  * @q: the priority event queue
310  * @cl: the class to add
311  * @delay: delay in nanoseconds
312  *
313  * The class is added to the priority event queue to indicate that the class
314  * will change its mode at time cl->pq_key (q->now + delay, in nanoseconds).
315  * Make sure that the class is not already in the queue.
316  */
317 static void htb_add_to_wait_tree(struct htb_sched *q,
318 				 struct htb_class *cl, s64 delay)
319 {
320 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
321 
322 	cl->pq_key = q->now + delay;
323 	if (cl->pq_key == q->now)
324 		cl->pq_key++;
325 
326 	/* update the nearest event cache */
327 	if (q->near_ev_cache[cl->level] > cl->pq_key)
328 		q->near_ev_cache[cl->level] = cl->pq_key;
329 
330 	while (*p) {
331 		struct htb_class *c;
332 		parent = *p;
333 		c = rb_entry(parent, struct htb_class, pq_node);
334 		if (cl->pq_key >= c->pq_key)
335 			p = &parent->rb_right;
336 		else
337 			p = &parent->rb_left;
338 	}
339 	rb_link_node(&cl->pq_node, parent, p);
340 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
341 }
342 
343 /**
344  * htb_next_rb_node - finds next node in binary tree
345  * @n: the current node in binary tree
346  *
347  * When we are past the last key, *n is set to NULL.
348  * Average complexity is 2 steps per call.
349  */
350 static inline void htb_next_rb_node(struct rb_node **n)
351 {
352 	*n = rb_next(*n);
353 }
354 
355 /**
356  * htb_add_class_to_row - add class to its row
357  * @q: the priority event queue
358  * @cl: the class to add
359  * @mask: the given priorities in class in bitmap
360  *
361  * The class is added to row at priorities marked in mask.
362  * It does nothing if mask == 0.
363  */
364 static inline void htb_add_class_to_row(struct htb_sched *q,
365 					struct htb_class *cl, int mask)
366 {
367 	q->row_mask[cl->level] |= mask;
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		mask &= ~(1 << prio);
371 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
372 	}
373 }
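
/* Editorial note: ffz(~mask) is simply "index of the lowest set bit in mask";
 * the loop above peels priorities off the mask one bit at a time, e.g.
 * mask == 0x5 visits prio 0 and then prio 2.
 */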
374 
375 /* If this triggers, it is a bug in this code, but it need not be fatal */
376 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
377 {
378 	if (RB_EMPTY_NODE(rb)) {
379 		WARN_ON(1);
380 	} else {
381 		rb_erase(rb, root);
382 		RB_CLEAR_NODE(rb);
383 	}
384 }
385 
386 
387 /**
388  * htb_remove_class_from_row - removes class from its row
389  * @q: the priority event queue
390  * @cl: the class to remove
391  * @mask: the given priorities in class in bitmap
392  *
393  * The class is removed from row at priorities marked in mask.
394  * It does nothing if mask == 0.
395  */
396 static inline void htb_remove_class_from_row(struct htb_sched *q,
397 						 struct htb_class *cl, int mask)
398 {
399 	int m = 0;
400 	struct htb_level *hlevel = &q->hlevel[cl->level];
401 
402 	while (mask) {
403 		int prio = ffz(~mask);
404 		struct htb_prio *hprio = &hlevel->hprio[prio];
405 
406 		mask &= ~(1 << prio);
407 		if (hprio->ptr == cl->node + prio)
408 			htb_next_rb_node(&hprio->ptr);
409 
410 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
411 		if (!hprio->row.rb_node)
412 			m |= 1 << prio;
413 	}
414 	q->row_mask[cl->level] &= ~m;
415 }
416 
417 /**
418  * htb_activate_prios - creates an active class's feed chain
419  * @q: the priority event queue
420  * @cl: the class to activate
421  *
422  * The class is connected to its ancestors and/or the appropriate rows
423  * for the priorities it participates in. cl->cmode must be the new
424  * (activated) mode. It does nothing if cl->prio_activity == 0.
425  */
426 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
427 {
428 	struct htb_class *p = cl->parent;
429 	long m, mask = cl->prio_activity;
430 
431 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
432 		m = mask;
433 		while (m) {
434 			int prio = ffz(~m);
435 			m &= ~(1 << prio);
436 
437 			if (p->inner.clprio[prio].feed.rb_node)
438 				/* parent already has its feed in use, so
439 				 * reset the bit in mask as the parent is already ok
440 				 */
441 				mask &= ~(1 << prio);
442 
443 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
444 		}
445 		p->prio_activity |= mask;
446 		cl = p;
447 		p = cl->parent;
448 
449 	}
450 	if (cl->cmode == HTB_CAN_SEND && mask)
451 		htb_add_class_to_row(q, cl, mask);
452 }
453 
454 /**
455  * htb_deactivate_prios - remove class from feed chain
456  * @q: the priority event queue
457  * @cl: the class to deactivate
458  *
459  * cl->cmode must represent the old mode (before deactivation). It does
460  * nothing if cl->prio_activity == 0. The class is removed from all feed
461  * chains and rows.
462  */
463 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
464 {
465 	struct htb_class *p = cl->parent;
466 	long m, mask = cl->prio_activity;
467 
468 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
469 		m = mask;
470 		mask = 0;
471 		while (m) {
472 			int prio = ffz(~m);
473 			m &= ~(1 << prio);
474 
475 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
476 				/* we are removing a child which is pointed to from
477 				 * the parent feed - forget the pointer but remember
478 				 * the classid
479 				 */
480 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
481 				p->inner.clprio[prio].ptr = NULL;
482 			}
483 
484 			htb_safe_rb_erase(cl->node + prio,
485 					  &p->inner.clprio[prio].feed);
486 
487 			if (!p->inner.clprio[prio].feed.rb_node)
488 				mask |= 1 << prio;
489 		}
490 
491 		p->prio_activity &= ~mask;
492 		cl = p;
493 		p = cl->parent;
494 
495 	}
496 	if (cl->cmode == HTB_CAN_SEND && mask)
497 		htb_remove_class_from_row(q, cl, mask);
498 }
499 
500 static inline s64 htb_lowater(const struct htb_class *cl)
501 {
502 	if (htb_hysteresis)
503 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
504 	else
505 		return 0;
506 }
507 static inline s64 htb_hiwater(const struct htb_class *cl)
508 {
509 	if (htb_hysteresis)
510 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
511 	else
512 		return 0;
513 }
514 
515 
516 /**
517  * htb_class_mode - computes and returns current class mode
518  * @cl: the target class
519  * @diff: diff time in nanoseconds
520  *
521  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
522  * is not HTB_CAN_SEND then *diff is updated to the time difference
523  * from now to the time when cl will change its state.
524  * It is also worth noting that the class mode doesn't change simply
525  * at cl->{c,}tokens == 0; rather, there can be hysteresis over the
526  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
527  * mode transitions per time unit. The speed gain is about 1/6.
528  */
529 static inline enum htb_cmode
530 htb_class_mode(struct htb_class *cl, s64 *diff)
531 {
532 	s64 toks;
533 
534 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
535 		*diff = -toks;
536 		return HTB_CANT_SEND;
537 	}
538 
539 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
540 		return HTB_CAN_SEND;
541 
542 	*diff = -toks;
543 	return HTB_MAY_BORROW;
544 }
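
/* Worked example (editorial, illustrative only): take a class that is
 * currently in HTB_CAN_SEND with htb_hysteresis enabled.  Then
 * htb_hiwater() returns -cl->buffer and htb_lowater() returns -cl->cbuffer,
 * so the decision above becomes roughly:
 *
 *	cl->ctokens + diff <  -cl->cbuffer  ->  HTB_CANT_SEND
 *	cl->tokens  + diff >= -cl->buffer   ->  HTB_CAN_SEND (stays put)
 *	otherwise                           ->  HTB_MAY_BORROW
 *
 * With htb_hysteresis == 0 both thresholds collapse to 0, so the mode flips
 * exactly when the respective token bucket is exhausted.
 */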
545 
546 /**
547  * htb_change_class_mode - changes a class's mode
548  * @q: the priority event queue
549  * @cl: the target class
550  * @diff: diff time in nanoseconds
551  *
552  * This should be the only way to change a class's mode under normal
553  * circumstances. The routine will update the feed list linkage, change the
554  * mode and add the class to the wait event queue if appropriate. The new mode
555  * should be different from the old one, and cl->pq_key has to be valid if
556  * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
557  */
558 static void
559 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
560 {
561 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
562 
563 	if (new_mode == cl->cmode)
564 		return;
565 
566 	if (new_mode == HTB_CANT_SEND) {
567 		cl->overlimits++;
568 		q->overlimits++;
569 	}
570 
571 	if (cl->prio_activity) {	/* not necessary: speed optimization */
572 		if (cl->cmode != HTB_CANT_SEND)
573 			htb_deactivate_prios(q, cl);
574 		cl->cmode = new_mode;
575 		if (new_mode != HTB_CANT_SEND)
576 			htb_activate_prios(q, cl);
577 	} else
578 		cl->cmode = new_mode;
579 }
580 
581 /**
582  * htb_activate - inserts leaf cl into appropriate active feeds
583  * @q: the priority event queue
584  * @cl: the target class
585  *
586  * The routine learns the (new) priority of the leaf and activates the feed
587  * chain for that prio. It can be called on an already active leaf safely.
589  */
590 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
591 {
592 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
593 
594 	if (!cl->prio_activity) {
595 		cl->prio_activity = 1 << cl->prio;
596 		htb_activate_prios(q, cl);
597 	}
598 }
599 
600 /**
601  * htb_deactivate - remove leaf cl from active feeds
602  * @q: the priority event queue
603  * @cl: the target class
604  *
605  * Make sure that the leaf is active. In other words, it can't be called
606  * with a non-active leaf.
607  */
608 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
609 {
610 	WARN_ON(!cl->prio_activity);
611 
612 	htb_deactivate_prios(q, cl);
613 	cl->prio_activity = 0;
614 }
615 
616 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
617 		       struct sk_buff **to_free)
618 {
619 	int ret;
620 	unsigned int len = qdisc_pkt_len(skb);
621 	struct htb_sched *q = qdisc_priv(sch);
622 	struct htb_class *cl = htb_classify(skb, sch, &ret);
623 
624 	if (cl == HTB_DIRECT) {
625 		/* enqueue to helper queue */
626 		if (q->direct_queue.qlen < q->direct_qlen) {
627 			__qdisc_enqueue_tail(skb, &q->direct_queue);
628 			q->direct_pkts++;
629 		} else {
630 			return qdisc_drop(skb, sch, to_free);
631 		}
632 #ifdef CONFIG_NET_CLS_ACT
633 	} else if (!cl) {
634 		if (ret & __NET_XMIT_BYPASS)
635 			qdisc_qstats_drop(sch);
636 		__qdisc_drop(skb, to_free);
637 		return ret;
638 #endif
639 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
640 					to_free)) != NET_XMIT_SUCCESS) {
641 		if (net_xmit_drop_count(ret)) {
642 			qdisc_qstats_drop(sch);
643 			cl->drops++;
644 		}
645 		return ret;
646 	} else {
647 		htb_activate(q, cl);
648 	}
649 
650 	sch->qstats.backlog += len;
651 	sch->q.qlen++;
652 	return NET_XMIT_SUCCESS;
653 }
654 
655 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
656 {
657 	s64 toks = diff + cl->tokens;
658 
659 	if (toks > cl->buffer)
660 		toks = cl->buffer;
661 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
662 	if (toks <= -cl->mbuffer)
663 		toks = 1 - cl->mbuffer;
664 
665 	cl->tokens = toks;
666 }
667 
668 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
669 {
670 	s64 toks = diff + cl->ctokens;
671 
672 	if (toks > cl->cbuffer)
673 		toks = cl->cbuffer;
674 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
675 	if (toks <= -cl->mbuffer)
676 		toks = 1 - cl->mbuffer;
677 
678 	cl->ctokens = toks;
679 }
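
/* Editorial sketch (not part of the original source): both helpers above are
 * the same clamped token-bucket update, applied to the rate bucket and the
 * ceil bucket respectively.  In pseudo-C, with assumed names:
 *
 *	toks = min(toks + elapsed_ns, bucket_depth);
 *	toks -= psched_l2t_ns(rate_or_ceil, bytes);	// time needed to send "bytes"
 *	if (toks <= -cl->mbuffer)			// clamp the accumulated debt
 *		toks = 1 - cl->mbuffer;
 */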
680 
681 /**
682  * htb_charge_class - charges amount "bytes" to leaf and ancestors
683  * @q: the priority event queue
684  * @cl: the class to start iterating from
685  * @level: the minimum level to account
686  * @skb: the socket buffer
687  *
688  * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
689  * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
690  * the leaf and all ancestors, and to the rate bucket for ancestors at levels
691  * "level" and higher. It also handles a possible change of mode resulting
692  * from the update. Note that the mode can also increase here (MAY_BORROW to
693  * CAN_SEND) because we can use a more precise clock than the event queue here.
694  * In such a case we remove the class from the event queue first.
695  */
696 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
697 			     int level, struct sk_buff *skb)
698 {
699 	int bytes = qdisc_pkt_len(skb);
700 	enum htb_cmode old_mode;
701 	s64 diff;
702 
703 	while (cl) {
704 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
705 		if (cl->level >= level) {
706 			if (cl->level == level)
707 				cl->xstats.lends++;
708 			htb_accnt_tokens(cl, bytes, diff);
709 		} else {
710 			cl->xstats.borrows++;
711 			cl->tokens += diff;	/* we moved t_c; update tokens */
712 		}
713 		htb_accnt_ctokens(cl, bytes, diff);
714 		cl->t_c = q->now;
715 
716 		old_mode = cl->cmode;
717 		diff = 0;
718 		htb_change_class_mode(q, cl, &diff);
719 		if (old_mode != cl->cmode) {
720 			if (old_mode != HTB_CAN_SEND)
721 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
722 			if (cl->cmode != HTB_CAN_SEND)
723 				htb_add_to_wait_tree(q, cl, diff);
724 		}
725 
726 		/* update basic stats except for leaves which are already updated */
727 		if (cl->level)
728 			bstats_update(&cl->bstats, skb);
729 
730 		cl = cl->parent;
731 	}
732 }
733 
734 /**
735  * htb_do_events - make mode changes to classes at the level
736  * @q: the priority event queue
737  * @level: which wait_pq in 'q->hlevel'
738  * @start: start jiffies
739  *
740  * Scans event queue for pending events and applies them. Returns time of
741  * next pending event (0 for no event in pq, q->now for too many events).
742  * Note: Only events with cl->pq_key <= q->now are applied.
743  */
744 static s64 htb_do_events(struct htb_sched *q, const int level,
745 			 unsigned long start)
746 {
747 	/* don't run for longer than 2 jiffies; 2 is used instead of
748 	 * 1 to simplify things when jiffy is going to be incremented
749 	 * too soon
750 	 */
751 	unsigned long stop_at = start + 2;
752 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
753 
754 	while (time_before(jiffies, stop_at)) {
755 		struct htb_class *cl;
756 		s64 diff;
757 		struct rb_node *p = rb_first(wait_pq);
758 
759 		if (!p)
760 			return 0;
761 
762 		cl = rb_entry(p, struct htb_class, pq_node);
763 		if (cl->pq_key > q->now)
764 			return cl->pq_key;
765 
766 		htb_safe_rb_erase(p, wait_pq);
767 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
768 		htb_change_class_mode(q, cl, &diff);
769 		if (cl->cmode != HTB_CAN_SEND)
770 			htb_add_to_wait_tree(q, cl, diff);
771 	}
772 
773 	/* too much load - let's continue after a break for scheduling */
774 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
775 		pr_warn("htb: too many events!\n");
776 		q->warned |= HTB_WARN_TOOMANYEVENTS;
777 	}
778 
779 	return q->now;
780 }
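
/* Editorial note: returning q->now from htb_do_events() marks this level as
 * "event still pending".  htb_dequeue() then sees next_event <= q->now and
 * falls back to schedule_work() instead of the watchdog, so the remaining
 * events are processed from htb_work_func() outside this 2-jiffy budget.
 */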
781 
782 /* Returns class->node+prio from the id-tree where the class's id is >= id.
783  * Returns NULL if no such node exists.
784  */
785 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
786 					      u32 id)
787 {
788 	struct rb_node *r = NULL;
789 	while (n) {
790 		struct htb_class *cl =
791 		    rb_entry(n, struct htb_class, node[prio]);
792 
793 		if (id > cl->common.classid) {
794 			n = n->rb_right;
795 		} else if (id < cl->common.classid) {
796 			r = n;
797 			n = n->rb_left;
798 		} else {
799 			return n;
800 		}
801 	}
802 	return r;
803 }
804 
805 /**
806  * htb_lookup_leaf - returns next leaf class in DRR order
807  * @hprio: the htb_prio holding the row/feed to search
808  * @prio: which prio in class
809  *
810  * Find the leaf that the current feed pointer points to.
811  */
812 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
813 {
814 	int i;
815 	struct {
816 		struct rb_node *root;
817 		struct rb_node **pptr;
818 		u32 *pid;
819 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
820 
821 	BUG_ON(!hprio->row.rb_node);
822 	sp->root = hprio->row.rb_node;
823 	sp->pptr = &hprio->ptr;
824 	sp->pid = &hprio->last_ptr_id;
825 
826 	for (i = 0; i < 65535; i++) {
827 		if (!*sp->pptr && *sp->pid) {
828 			/* ptr was invalidated but id is valid - try to recover
829 			 * the original or next ptr
830 			 */
831 			*sp->pptr =
832 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
833 		}
834 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
835 				 * can become out of date quickly
836 				 */
837 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
838 			*sp->pptr = sp->root;
839 			while ((*sp->pptr)->rb_left)
840 				*sp->pptr = (*sp->pptr)->rb_left;
841 			if (sp > stk) {
842 				sp--;
843 				if (!*sp->pptr) {
844 					WARN_ON(1);
845 					return NULL;
846 				}
847 				htb_next_rb_node(sp->pptr);
848 			}
849 		} else {
850 			struct htb_class *cl;
851 			struct htb_prio *clp;
852 
853 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
854 			if (!cl->level)
855 				return cl;
856 			clp = &cl->inner.clprio[prio];
857 			(++sp)->root = clp->feed.rb_node;
858 			sp->pptr = &clp->ptr;
859 			sp->pid = &clp->last_ptr_id;
860 		}
861 	}
862 	WARN_ON(1);
863 	return NULL;
864 }
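
/* Editorial note: the small stack above mirrors the path from the row down
 * through each inner class's feed tree.  Every frame remembers where DRR left
 * off via *pptr, or, if that rb_node was erased in the meantime, via the
 * *pid classid hint that htb_id_find_next_upper() uses to resume the walk.
 */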
865 
866 /* dequeues packet at given priority and level; call only if
867  * you are sure that there is an active class at prio/level
868  */
869 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
870 					const int level)
871 {
872 	struct sk_buff *skb = NULL;
873 	struct htb_class *cl, *start;
874 	struct htb_level *hlevel = &q->hlevel[level];
875 	struct htb_prio *hprio = &hlevel->hprio[prio];
876 
877 	/* look initial class up in the row */
878 	start = cl = htb_lookup_leaf(hprio, prio);
879 
880 	do {
881 next:
882 		if (unlikely(!cl))
883 			return NULL;
884 
885 		/* class can be empty - it is unlikely but can be true if leaf
886 		 * qdisc drops packets in enqueue routine or if someone used
887 		 * graft operation on the leaf since last dequeue;
888 		 * simply deactivate and skip such class
889 		 */
890 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
891 			struct htb_class *next;
892 			htb_deactivate(q, cl);
893 
894 			/* row/level might become empty */
895 			if ((q->row_mask[level] & (1 << prio)) == 0)
896 				return NULL;
897 
898 			next = htb_lookup_leaf(hprio, prio);
899 
900 			if (cl == start)	/* fix start if we just deleted it */
901 				start = next;
902 			cl = next;
903 			goto next;
904 		}
905 
906 		skb = cl->leaf.q->dequeue(cl->leaf.q);
907 		if (likely(skb != NULL))
908 			break;
909 
910 		qdisc_warn_nonwc("htb", cl->leaf.q);
911 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
912 					 &q->hlevel[0].hprio[prio].ptr);
913 		cl = htb_lookup_leaf(hprio, prio);
914 
915 	} while (cl != start);
916 
917 	if (likely(skb != NULL)) {
918 		bstats_update(&cl->bstats, skb);
919 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
920 		if (cl->leaf.deficit[level] < 0) {
921 			cl->leaf.deficit[level] += cl->quantum;
922 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
923 						 &q->hlevel[0].hprio[prio].ptr);
924 		}
925 		/* this used to be after charge_class but this arrangement
926 		 * gives us slightly better performance
927 		 */
928 		if (!cl->leaf.q->q.qlen)
929 			htb_deactivate(q, cl);
930 		htb_charge_class(q, cl, level, skb);
931 	}
932 	return skb;
933 }
934 
935 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
936 {
937 	struct sk_buff *skb;
938 	struct htb_sched *q = qdisc_priv(sch);
939 	int level;
940 	s64 next_event;
941 	unsigned long start_at;
942 
943 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
944 	skb = __qdisc_dequeue_head(&q->direct_queue);
945 	if (skb != NULL) {
946 ok:
947 		qdisc_bstats_update(sch, skb);
948 		qdisc_qstats_backlog_dec(sch, skb);
949 		sch->q.qlen--;
950 		return skb;
951 	}
952 
953 	if (!sch->q.qlen)
954 		goto fin;
955 	q->now = ktime_get_ns();
956 	start_at = jiffies;
957 
958 	next_event = q->now + 5LLU * NSEC_PER_SEC;
959 
960 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
961 		/* common case optimization - skip event handler quickly */
962 		int m;
963 		s64 event = q->near_ev_cache[level];
964 
965 		if (q->now >= event) {
966 			event = htb_do_events(q, level, start_at);
967 			if (!event)
968 				event = q->now + NSEC_PER_SEC;
969 			q->near_ev_cache[level] = event;
970 		}
971 
972 		if (next_event > event)
973 			next_event = event;
974 
975 		m = ~q->row_mask[level];
976 		while (m != (int)(-1)) {
977 			int prio = ffz(m);
978 
979 			m |= 1 << prio;
980 			skb = htb_dequeue_tree(q, prio, level);
981 			if (likely(skb != NULL))
982 				goto ok;
983 		}
984 	}
985 	if (likely(next_event > q->now))
986 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
987 	else
988 		schedule_work(&q->work);
989 fin:
990 	return skb;
991 }
992 
993 /* reset all classes */
994 /* always called under BH & queue lock */
995 static void htb_reset(struct Qdisc *sch)
996 {
997 	struct htb_sched *q = qdisc_priv(sch);
998 	struct htb_class *cl;
999 	unsigned int i;
1000 
1001 	for (i = 0; i < q->clhash.hashsize; i++) {
1002 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1003 			if (cl->level)
1004 				memset(&cl->inner, 0, sizeof(cl->inner));
1005 			else {
1006 				if (cl->leaf.q && !q->offload)
1007 					qdisc_reset(cl->leaf.q);
1008 			}
1009 			cl->prio_activity = 0;
1010 			cl->cmode = HTB_CAN_SEND;
1011 		}
1012 	}
1013 	qdisc_watchdog_cancel(&q->watchdog);
1014 	__qdisc_reset_queue(&q->direct_queue);
1015 	memset(q->hlevel, 0, sizeof(q->hlevel));
1016 	memset(q->row_mask, 0, sizeof(q->row_mask));
1017 }
1018 
1019 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1020 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1021 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1022 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1023 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1024 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1025 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1026 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1027 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1028 };
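
/* Editorial illustration (attribute names from the policy above; the command
 * is iproute2 usage shown only to make the nesting concrete):
 * "tc qdisc add dev eth0 root handle 1: htb default 20 direct_qlen 50"
 * arrives in htb_init() nested under TCA_OPTIONS roughly as:
 *
 *	TCA_HTB_INIT        -> struct tc_htb_glob { .version, .defcls = 0x20, ... }
 *	TCA_HTB_DIRECT_QLEN -> u32 50
 */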
1029 
1030 static void htb_work_func(struct work_struct *work)
1031 {
1032 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1033 	struct Qdisc *sch = q->watchdog.qdisc;
1034 
1035 	rcu_read_lock();
1036 	__netif_schedule(qdisc_root(sch));
1037 	rcu_read_unlock();
1038 }
1039 
1040 static void htb_set_lockdep_class_child(struct Qdisc *q)
1041 {
1042 	static struct lock_class_key child_key;
1043 
1044 	lockdep_set_class(qdisc_lock(q), &child_key);
1045 }
1046 
1047 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1048 {
1049 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1050 }
1051 
1052 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1053 		    struct netlink_ext_ack *extack)
1054 {
1055 	struct net_device *dev = qdisc_dev(sch);
1056 	struct tc_htb_qopt_offload offload_opt;
1057 	struct htb_sched *q = qdisc_priv(sch);
1058 	struct nlattr *tb[TCA_HTB_MAX + 1];
1059 	struct tc_htb_glob *gopt;
1060 	unsigned int ntx;
1061 	bool offload;
1062 	int err;
1063 
1064 	qdisc_watchdog_init(&q->watchdog, sch);
1065 	INIT_WORK(&q->work, htb_work_func);
1066 
1067 	if (!opt)
1068 		return -EINVAL;
1069 
1070 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1071 	if (err)
1072 		return err;
1073 
1074 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1075 					  NULL);
1076 	if (err < 0)
1077 		return err;
1078 
1079 	if (!tb[TCA_HTB_INIT])
1080 		return -EINVAL;
1081 
1082 	gopt = nla_data(tb[TCA_HTB_INIT]);
1083 	if (gopt->version != HTB_VER >> 16)
1084 		return -EINVAL;
1085 
1086 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1087 
1088 	if (offload) {
1089 		if (sch->parent != TC_H_ROOT) {
1090 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1091 			return -EOPNOTSUPP;
1092 		}
1093 
1094 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1095 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1096 			return -EOPNOTSUPP;
1097 		}
1098 
1099 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1100 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1101 					   sizeof(*q->direct_qdiscs),
1102 					   GFP_KERNEL);
1103 		if (!q->direct_qdiscs)
1104 			return -ENOMEM;
1105 	}
1106 
1107 	err = qdisc_class_hash_init(&q->clhash);
1108 	if (err < 0)
1109 		return err;
1110 
1111 	if (tb[TCA_HTB_DIRECT_QLEN])
1112 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1113 	else
1114 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1115 
1116 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1117 		q->rate2quantum = 1;
1118 	q->defcls = gopt->defcls;
1119 
1120 	if (!offload)
1121 		return 0;
1122 
1123 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1124 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1125 		struct Qdisc *qdisc;
1126 
1127 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1128 					  TC_H_MAKE(sch->handle, 0), extack);
1129 		if (!qdisc) {
1130 			return -ENOMEM;
1131 		}
1132 
1133 		htb_set_lockdep_class_child(qdisc);
1134 		q->direct_qdiscs[ntx] = qdisc;
1135 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1136 	}
1137 
1138 	sch->flags |= TCQ_F_MQROOT;
1139 
1140 	offload_opt = (struct tc_htb_qopt_offload) {
1141 		.command = TC_HTB_CREATE,
1142 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1143 		.classid = TC_H_MIN(q->defcls),
1144 		.extack = extack,
1145 	};
1146 	err = htb_offload(dev, &offload_opt);
1147 	if (err)
1148 		return err;
1149 
1150 	/* Defer this assignment, so that htb_destroy skips offload-related
1151 	 * parts (especially calling ndo_setup_tc) on errors.
1152 	 */
1153 	q->offload = true;
1154 
1155 	return 0;
1156 }
1157 
1158 static void htb_attach_offload(struct Qdisc *sch)
1159 {
1160 	struct net_device *dev = qdisc_dev(sch);
1161 	struct htb_sched *q = qdisc_priv(sch);
1162 	unsigned int ntx;
1163 
1164 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1165 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1166 
1167 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1168 		qdisc_put(old);
1169 		qdisc_hash_add(qdisc, false);
1170 	}
1171 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1172 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1173 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1174 
1175 		qdisc_put(old);
1176 	}
1177 
1178 	kfree(q->direct_qdiscs);
1179 	q->direct_qdiscs = NULL;
1180 }
1181 
1182 static void htb_attach_software(struct Qdisc *sch)
1183 {
1184 	struct net_device *dev = qdisc_dev(sch);
1185 	unsigned int ntx;
1186 
1187 	/* Resemble qdisc_graft behavior. */
1188 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1189 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1190 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1191 
1192 		qdisc_refcount_inc(sch);
1193 
1194 		qdisc_put(old);
1195 	}
1196 }
1197 
1198 static void htb_attach(struct Qdisc *sch)
1199 {
1200 	struct htb_sched *q = qdisc_priv(sch);
1201 
1202 	if (q->offload)
1203 		htb_attach_offload(sch);
1204 	else
1205 		htb_attach_software(sch);
1206 }
1207 
1208 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1209 {
1210 	struct htb_sched *q = qdisc_priv(sch);
1211 	struct nlattr *nest;
1212 	struct tc_htb_glob gopt;
1213 
1214 	if (q->offload)
1215 		sch->flags |= TCQ_F_OFFLOADED;
1216 	else
1217 		sch->flags &= ~TCQ_F_OFFLOADED;
1218 
1219 	sch->qstats.overlimits = q->overlimits;
1220 	/* It's safe to not acquire the qdisc lock. As we hold RTNL,
1221 	 * no change can happen on the qdisc parameters.
1222 	 */
1223 
1224 	gopt.direct_pkts = q->direct_pkts;
1225 	gopt.version = HTB_VER;
1226 	gopt.rate2quantum = q->rate2quantum;
1227 	gopt.defcls = q->defcls;
1228 	gopt.debug = 0;
1229 
1230 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1231 	if (nest == NULL)
1232 		goto nla_put_failure;
1233 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1234 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1235 		goto nla_put_failure;
1236 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1237 		goto nla_put_failure;
1238 
1239 	return nla_nest_end(skb, nest);
1240 
1241 nla_put_failure:
1242 	nla_nest_cancel(skb, nest);
1243 	return -1;
1244 }
1245 
1246 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1247 			  struct sk_buff *skb, struct tcmsg *tcm)
1248 {
1249 	struct htb_class *cl = (struct htb_class *)arg;
1250 	struct htb_sched *q = qdisc_priv(sch);
1251 	struct nlattr *nest;
1252 	struct tc_htb_opt opt;
1253 
1254 	/* It's safe to not acquire the qdisc lock. As we hold RTNL,
1255 	 * no change can happen on the class parameters.
1256 	 */
1257 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1258 	tcm->tcm_handle = cl->common.classid;
1259 	if (!cl->level && cl->leaf.q)
1260 		tcm->tcm_info = cl->leaf.q->handle;
1261 
1262 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1263 	if (nest == NULL)
1264 		goto nla_put_failure;
1265 
1266 	memset(&opt, 0, sizeof(opt));
1267 
1268 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1269 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1270 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1271 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1272 	opt.quantum = cl->quantum;
1273 	opt.prio = cl->prio;
1274 	opt.level = cl->level;
1275 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1276 		goto nla_put_failure;
1277 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1278 		goto nla_put_failure;
1279 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1280 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1281 			      TCA_HTB_PAD))
1282 		goto nla_put_failure;
1283 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1284 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1285 			      TCA_HTB_PAD))
1286 		goto nla_put_failure;
1287 
1288 	return nla_nest_end(skb, nest);
1289 
1290 nla_put_failure:
1291 	nla_nest_cancel(skb, nest);
1292 	return -1;
1293 }
1294 
1295 static void htb_offload_aggregate_stats(struct htb_sched *q,
1296 					struct htb_class *cl)
1297 {
1298 	u64 bytes = 0, packets = 0;
1299 	struct htb_class *c;
1300 	unsigned int i;
1301 
1302 	gnet_stats_basic_sync_init(&cl->bstats);
1303 
1304 	for (i = 0; i < q->clhash.hashsize; i++) {
1305 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1306 			struct htb_class *p = c;
1307 
1308 			while (p && p->level < cl->level)
1309 				p = p->parent;
1310 
1311 			if (p != cl)
1312 				continue;
1313 
1314 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1315 			packets += u64_stats_read(&c->bstats_bias.packets);
1316 			if (c->level == 0) {
1317 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1318 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1319 			}
1320 		}
1321 	}
1322 	_bstats_update(&cl->bstats, bytes, packets);
1323 }
1324 
1325 static int
1326 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1327 {
1328 	struct htb_class *cl = (struct htb_class *)arg;
1329 	struct htb_sched *q = qdisc_priv(sch);
1330 	struct gnet_stats_queue qs = {
1331 		.drops = cl->drops,
1332 		.overlimits = cl->overlimits,
1333 	};
1334 	__u32 qlen = 0;
1335 
1336 	if (!cl->level && cl->leaf.q)
1337 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1338 
1339 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1340 				    INT_MIN, INT_MAX);
1341 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1342 				     INT_MIN, INT_MAX);
1343 
1344 	if (q->offload) {
1345 		if (!cl->level) {
1346 			if (cl->leaf.q)
1347 				cl->bstats = cl->leaf.q->bstats;
1348 			else
1349 				gnet_stats_basic_sync_init(&cl->bstats);
1350 			_bstats_update(&cl->bstats,
1351 				       u64_stats_read(&cl->bstats_bias.bytes),
1352 				       u64_stats_read(&cl->bstats_bias.packets));
1353 		} else {
1354 			htb_offload_aggregate_stats(q, cl);
1355 		}
1356 	}
1357 
1358 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1359 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1360 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1361 		return -1;
1362 
1363 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1364 }
1365 
1366 static struct netdev_queue *
1367 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1368 {
1369 	struct net_device *dev = qdisc_dev(sch);
1370 	struct tc_htb_qopt_offload offload_opt;
1371 	struct htb_sched *q = qdisc_priv(sch);
1372 	int err;
1373 
1374 	if (!q->offload)
1375 		return sch->dev_queue;
1376 
1377 	offload_opt = (struct tc_htb_qopt_offload) {
1378 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1379 		.classid = TC_H_MIN(tcm->tcm_parent),
1380 	};
1381 	err = htb_offload(dev, &offload_opt);
1382 	if (err || offload_opt.qid >= dev->num_tx_queues)
1383 		return NULL;
1384 	return netdev_get_tx_queue(dev, offload_opt.qid);
1385 }
1386 
1387 static struct Qdisc *
1388 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1389 {
1390 	struct net_device *dev = dev_queue->dev;
1391 	struct Qdisc *old_q;
1392 
1393 	if (dev->flags & IFF_UP)
1394 		dev_deactivate(dev);
1395 	old_q = dev_graft_qdisc(dev_queue, new_q);
1396 	if (new_q)
1397 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1398 	if (dev->flags & IFF_UP)
1399 		dev_activate(dev);
1400 
1401 	return old_q;
1402 }
1403 
1404 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1405 {
1406 	struct netdev_queue *queue;
1407 
1408 	queue = cl->leaf.offload_queue;
1409 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1410 		WARN_ON(cl->leaf.q->dev_queue != queue);
1411 
1412 	return queue;
1413 }
1414 
1415 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1416 				   struct htb_class *cl_new, bool destroying)
1417 {
1418 	struct netdev_queue *queue_old, *queue_new;
1419 	struct net_device *dev = qdisc_dev(sch);
1420 
1421 	queue_old = htb_offload_get_queue(cl_old);
1422 	queue_new = htb_offload_get_queue(cl_new);
1423 
1424 	if (!destroying) {
1425 		struct Qdisc *qdisc;
1426 
1427 		if (dev->flags & IFF_UP)
1428 			dev_deactivate(dev);
1429 		qdisc = dev_graft_qdisc(queue_old, NULL);
1430 		WARN_ON(qdisc != cl_old->leaf.q);
1431 	}
1432 
1433 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1434 		cl_old->leaf.q->dev_queue = queue_new;
1435 	cl_old->leaf.offload_queue = queue_new;
1436 
1437 	if (!destroying) {
1438 		struct Qdisc *qdisc;
1439 
1440 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1441 		if (dev->flags & IFF_UP)
1442 			dev_activate(dev);
1443 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1444 	}
1445 }
1446 
1447 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1448 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1449 {
1450 	struct netdev_queue *dev_queue = sch->dev_queue;
1451 	struct htb_class *cl = (struct htb_class *)arg;
1452 	struct htb_sched *q = qdisc_priv(sch);
1453 	struct Qdisc *old_q;
1454 
1455 	if (cl->level)
1456 		return -EINVAL;
1457 
1458 	if (q->offload)
1459 		dev_queue = htb_offload_get_queue(cl);
1460 
1461 	if (!new) {
1462 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1463 					cl->common.classid, extack);
1464 		if (!new)
1465 			return -ENOBUFS;
1466 	}
1467 
1468 	if (q->offload) {
1469 		htb_set_lockdep_class_child(new);
1470 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1471 		qdisc_refcount_inc(new);
1472 		old_q = htb_graft_helper(dev_queue, new);
1473 	}
1474 
1475 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1476 
1477 	if (q->offload) {
1478 		WARN_ON(old_q != *old);
1479 		qdisc_put(old_q);
1480 	}
1481 
1482 	return 0;
1483 }
1484 
1485 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1486 {
1487 	struct htb_class *cl = (struct htb_class *)arg;
1488 	return !cl->level ? cl->leaf.q : NULL;
1489 }
1490 
1491 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1492 {
1493 	struct htb_class *cl = (struct htb_class *)arg;
1494 
1495 	htb_deactivate(qdisc_priv(sch), cl);
1496 }
1497 
1498 static inline int htb_parent_last_child(struct htb_class *cl)
1499 {
1500 	if (!cl->parent)
1501 		/* the root class */
1502 		return 0;
1503 	if (cl->parent->children > 1)
1504 		/* not the last child */
1505 		return 0;
1506 	return 1;
1507 }
1508 
1509 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1510 			       struct Qdisc *new_q)
1511 {
1512 	struct htb_sched *q = qdisc_priv(sch);
1513 	struct htb_class *parent = cl->parent;
1514 
1515 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1516 
1517 	if (parent->cmode != HTB_CAN_SEND)
1518 		htb_safe_rb_erase(&parent->pq_node,
1519 				  &q->hlevel[parent->level].wait_pq);
1520 
1521 	parent->level = 0;
1522 	memset(&parent->inner, 0, sizeof(parent->inner));
1523 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1524 	parent->tokens = parent->buffer;
1525 	parent->ctokens = parent->cbuffer;
1526 	parent->t_c = ktime_get_ns();
1527 	parent->cmode = HTB_CAN_SEND;
1528 	if (q->offload)
1529 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1530 }
1531 
1532 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1533 				       struct netdev_queue *dev_queue,
1534 				       struct Qdisc *new_q)
1535 {
1536 	struct Qdisc *old_q;
1537 
1538 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1539 	if (new_q)
1540 		qdisc_refcount_inc(new_q);
1541 	old_q = htb_graft_helper(dev_queue, new_q);
1542 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1543 }
1544 
1545 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1546 				     bool last_child, bool destroying,
1547 				     struct netlink_ext_ack *extack)
1548 {
1549 	struct tc_htb_qopt_offload offload_opt;
1550 	struct netdev_queue *dev_queue;
1551 	struct Qdisc *q = cl->leaf.q;
1552 	struct Qdisc *old = NULL;
1553 	int err;
1554 
1555 	if (cl->level)
1556 		return -EINVAL;
1557 
1558 	WARN_ON(!q);
1559 	dev_queue = htb_offload_get_queue(cl);
1560 	old = htb_graft_helper(dev_queue, NULL);
1561 	if (destroying)
1562 		/* Before HTB is destroyed, the kernel grafts noop_qdisc to
1563 		 * all queues.
1564 		 */
1565 		WARN_ON(!(old->flags & TCQ_F_BUILTIN));
1566 	else
1567 		WARN_ON(old != q);
1568 
1569 	if (cl->parent) {
1570 		_bstats_update(&cl->parent->bstats_bias,
1571 			       u64_stats_read(&q->bstats.bytes),
1572 			       u64_stats_read(&q->bstats.packets));
1573 	}
1574 
1575 	offload_opt = (struct tc_htb_qopt_offload) {
1576 		.command = !last_child ? TC_HTB_LEAF_DEL :
1577 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1578 			   TC_HTB_LEAF_DEL_LAST,
1579 		.classid = cl->common.classid,
1580 		.extack = extack,
1581 	};
1582 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1583 
1584 	if (!err || destroying)
1585 		qdisc_put(old);
1586 	else
1587 		htb_graft_helper(dev_queue, old);
1588 
1589 	if (last_child)
1590 		return err;
1591 
1592 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1593 		u32 classid = TC_H_MAJ(sch->handle) |
1594 			      TC_H_MIN(offload_opt.classid);
1595 		struct htb_class *moved_cl = htb_find(classid, sch);
1596 
1597 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1598 	}
1599 
1600 	return err;
1601 }
1602 
1603 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1604 {
1605 	if (!cl->level) {
1606 		WARN_ON(!cl->leaf.q);
1607 		qdisc_put(cl->leaf.q);
1608 	}
1609 	gen_kill_estimator(&cl->rate_est);
1610 	tcf_block_put(cl->block);
1611 	kfree(cl);
1612 }
1613 
1614 static void htb_destroy(struct Qdisc *sch)
1615 {
1616 	struct net_device *dev = qdisc_dev(sch);
1617 	struct tc_htb_qopt_offload offload_opt;
1618 	struct htb_sched *q = qdisc_priv(sch);
1619 	struct hlist_node *next;
1620 	bool nonempty, changed;
1621 	struct htb_class *cl;
1622 	unsigned int i;
1623 
1624 	cancel_work_sync(&q->work);
1625 	qdisc_watchdog_cancel(&q->watchdog);
1626 	/* This line used to be after the htb_destroy_class call below
1627 	 * and surprisingly it worked in 2.4. But it must precede it
1628 	 * because filters need their target class alive to be able to call
1629 	 * unbind_filter on it (without an Oops).
1630 	 */
1631 	tcf_block_put(q->block);
1632 
1633 	for (i = 0; i < q->clhash.hashsize; i++) {
1634 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1635 			tcf_block_put(cl->block);
1636 			cl->block = NULL;
1637 		}
1638 	}
1639 
1640 	do {
1641 		nonempty = false;
1642 		changed = false;
1643 		for (i = 0; i < q->clhash.hashsize; i++) {
1644 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1645 						  common.hnode) {
1646 				bool last_child;
1647 
1648 				if (!q->offload) {
1649 					htb_destroy_class(sch, cl);
1650 					continue;
1651 				}
1652 
1653 				nonempty = true;
1654 
1655 				if (cl->level)
1656 					continue;
1657 
1658 				changed = true;
1659 
1660 				last_child = htb_parent_last_child(cl);
1661 				htb_destroy_class_offload(sch, cl, last_child,
1662 							  true, NULL);
1663 				qdisc_class_hash_remove(&q->clhash,
1664 							&cl->common);
1665 				if (cl->parent)
1666 					cl->parent->children--;
1667 				if (last_child)
1668 					htb_parent_to_leaf(sch, cl, NULL);
1669 				htb_destroy_class(sch, cl);
1670 			}
1671 		}
1672 	} while (changed);
1673 	WARN_ON(nonempty);
1674 
1675 	qdisc_class_hash_destroy(&q->clhash);
1676 	__qdisc_reset_queue(&q->direct_queue);
1677 
1678 	if (q->offload) {
1679 		offload_opt = (struct tc_htb_qopt_offload) {
1680 			.command = TC_HTB_DESTROY,
1681 		};
1682 		htb_offload(dev, &offload_opt);
1683 	}
1684 
1685 	if (!q->direct_qdiscs)
1686 		return;
1687 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1688 		qdisc_put(q->direct_qdiscs[i]);
1689 	kfree(q->direct_qdiscs);
1690 }
1691 
1692 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1693 		      struct netlink_ext_ack *extack)
1694 {
1695 	struct htb_sched *q = qdisc_priv(sch);
1696 	struct htb_class *cl = (struct htb_class *)arg;
1697 	struct Qdisc *new_q = NULL;
1698 	int last_child = 0;
1699 	int err;
1700 
1701 	/* TODO: why don't we allow deleting a subtree? references? does the
1702 	 * tc subsystem guarantee us that in htb_destroy it holds no class
1703 	 * refs, so that we can remove children safely there?
1704 	 */
1705 	if (cl->children || cl->filter_cnt)
1706 		return -EBUSY;
1707 
1708 	if (!cl->level && htb_parent_last_child(cl))
1709 		last_child = 1;
1710 
1711 	if (q->offload) {
1712 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1713 						extack);
1714 		if (err)
1715 			return err;
1716 	}
1717 
1718 	if (last_child) {
1719 		struct netdev_queue *dev_queue = sch->dev_queue;
1720 
1721 		if (q->offload)
1722 			dev_queue = htb_offload_get_queue(cl);
1723 
1724 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1725 					  cl->parent->common.classid,
1726 					  NULL);
1727 		if (q->offload) {
1728 			if (new_q)
1729 				htb_set_lockdep_class_child(new_q);
1730 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1731 		}
1732 	}
1733 
1734 	sch_tree_lock(sch);
1735 
1736 	if (!cl->level)
1737 		qdisc_purge_queue(cl->leaf.q);
1738 
1739 	/* delete from hash and active; remainder in destroy_class */
1740 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1741 	if (cl->parent)
1742 		cl->parent->children--;
1743 
1744 	if (cl->prio_activity)
1745 		htb_deactivate(q, cl);
1746 
1747 	if (cl->cmode != HTB_CAN_SEND)
1748 		htb_safe_rb_erase(&cl->pq_node,
1749 				  &q->hlevel[cl->level].wait_pq);
1750 
1751 	if (last_child)
1752 		htb_parent_to_leaf(sch, cl, new_q);
1753 
1754 	sch_tree_unlock(sch);
1755 
1756 	htb_destroy_class(sch, cl);
1757 	return 0;
1758 }
1759 
1760 static int htb_change_class(struct Qdisc *sch, u32 classid,
1761 			    u32 parentid, struct nlattr **tca,
1762 			    unsigned long *arg, struct netlink_ext_ack *extack)
1763 {
1764 	int err = -EINVAL;
1765 	struct htb_sched *q = qdisc_priv(sch);
1766 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1767 	struct tc_htb_qopt_offload offload_opt;
1768 	struct nlattr *opt = tca[TCA_OPTIONS];
1769 	struct nlattr *tb[TCA_HTB_MAX + 1];
1770 	struct Qdisc *parent_qdisc = NULL;
1771 	struct netdev_queue *dev_queue;
1772 	struct tc_htb_opt *hopt;
1773 	u64 rate64, ceil64;
1774 	int warn = 0;
1775 
1776 	/* extract all subattrs from opt attr */
1777 	if (!opt)
1778 		goto failure;
1779 
1780 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1781 					  NULL);
1782 	if (err < 0)
1783 		goto failure;
1784 
1785 	err = -EINVAL;
1786 	if (tb[TCA_HTB_PARMS] == NULL)
1787 		goto failure;
1788 
1789 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1790 
1791 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1792 	if (!hopt->rate.rate || !hopt->ceil.rate)
1793 		goto failure;
1794 
1795 	if (q->offload) {
1796 		/* Options not supported by the offload. */
1797 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1798 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1799 			goto failure;
1800 		}
1801 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1802 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1803 			goto failure;
1804 		}
1805 		if (hopt->quantum) {
1806 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter");
1807 			goto failure;
1808 		}
1809 		if (hopt->prio) {
1810 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the prio parameter");
1811 			goto failure;
1812 		}
1813 	}
1814 
1815 	/* Keeping backward compatibility with rate_table based iproute2 tc */
1816 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1817 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1818 					      NULL));
1819 
1820 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1821 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1822 					      NULL));
1823 
1824 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1825 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1826 
1827 	if (!cl) {		/* new class */
1828 		struct net_device *dev = qdisc_dev(sch);
1829 		struct Qdisc *new_q, *old_q;
1830 		int prio;
1831 		struct {
1832 			struct nlattr		nla;
1833 			struct gnet_estimator	opt;
1834 		} est = {
1835 			.nla = {
1836 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1837 				.nla_type	= TCA_RATE,
1838 			},
1839 			.opt = {
1840 				/* 4s interval, 16s averaging constant */
1841 				.interval	= 2,
1842 				.ewma_log	= 2,
1843 			},
1844 		};
1845 
1846 		/* check for valid classid */
1847 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1848 		    htb_find(classid, sch))
1849 			goto failure;
1850 
1851 		/* check maximal depth */
1852 		if (parent && parent->parent && parent->parent->level < 2) {
1853 			pr_err("htb: tree is too deep\n");
1854 			goto failure;
1855 		}
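		/* Rationale: the new class is a leaf at level 0 and, per the
		 * level assignment further below, its parent ends up at
		 * parent->parent->level - 1 once it becomes an inner node, so
		 * the grandparent must sit at level 2 or higher for the new
		 * leaf to fit.
		 */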
1856 		err = -ENOBUFS;
1857 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1858 		if (!cl)
1859 			goto failure;
1860 
1861 		gnet_stats_basic_sync_init(&cl->bstats);
1862 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1863 
1864 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1865 		if (err) {
1866 			kfree(cl);
1867 			goto failure;
1868 		}
1869 		if (htb_rate_est || tca[TCA_RATE]) {
1870 			err = gen_new_estimator(&cl->bstats, NULL,
1871 						&cl->rate_est,
1872 						NULL,
1873 						true,
1874 						tca[TCA_RATE] ? : &est.nla);
1875 			if (err)
1876 				goto err_block_put;
1877 		}
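		/* The on-stack "est" above is a pre-built netlink attribute:
		 * it lets the same gen_new_estimator() call install a default
		 * estimator when htb_rate_est is set but no TCA_RATE attribute
		 * was supplied (the "?:" picks the user's attribute if present).
		 */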
1878 
1879 		cl->children = 0;
1880 		RB_CLEAR_NODE(&cl->pq_node);
1881 
1882 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1883 			RB_CLEAR_NODE(&cl->node[prio]);
1884 
1885 		cl->common.classid = classid;
1886 
1887 		/* Make sure nothing interrupts us between the two
1888 		 * ndo_setup_tc calls.
1889 		 */
1890 		ASSERT_RTNL();
1891 
1892 		/* Create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1893 		 * which can't be used inside of sch_tree_lock
1894 		 * -- thanks to Karlis Peisenieks
1895 		 */
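		/* Three cases decide which dev_queue backs the new leaf:
		 *  - no offload: use the qdisc's own dev_queue;
		 *  - offload and the parent is absent or already an inner
		 *    node: ask the driver for a queue via
		 *    TC_HTB_LEAF_ALLOC_QUEUE;
		 *  - offload and the parent is still a leaf: this is its first
		 *    child, so the parent's queue is reused and the driver is
		 *    told to convert it with TC_HTB_LEAF_TO_INNER, folding the
		 *    old leaf qdisc's byte/packet counts into
		 *    parent->bstats_bias before it is released.
		 */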
1896 		if (!q->offload) {
1897 			dev_queue = sch->dev_queue;
1898 		} else if (!(parent && !parent->level)) {
1899 			/* Assign a dev_queue to this classid. */
1900 			offload_opt = (struct tc_htb_qopt_offload) {
1901 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1902 				.classid = cl->common.classid,
1903 				.parent_classid = parent ?
1904 					TC_H_MIN(parent->common.classid) :
1905 					TC_HTB_CLASSID_ROOT,
1906 				.rate = max_t(u64, hopt->rate.rate, rate64),
1907 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1908 				.extack = extack,
1909 			};
1910 			err = htb_offload(dev, &offload_opt);
1911 			if (err) {
1912 				pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n",
1913 				       err);
1914 				goto err_kill_estimator;
1915 			}
1916 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1917 		} else { /* First child. */
1918 			dev_queue = htb_offload_get_queue(parent);
1919 			old_q = htb_graft_helper(dev_queue, NULL);
1920 			WARN_ON(old_q != parent->leaf.q);
1921 			offload_opt = (struct tc_htb_qopt_offload) {
1922 				.command = TC_HTB_LEAF_TO_INNER,
1923 				.classid = cl->common.classid,
1924 				.parent_classid =
1925 					TC_H_MIN(parent->common.classid),
1926 				.rate = max_t(u64, hopt->rate.rate, rate64),
1927 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1928 				.extack = extack,
1929 			};
1930 			err = htb_offload(dev, &offload_opt);
1931 			if (err) {
1932 				pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n",
1933 				       err);
1934 				htb_graft_helper(dev_queue, old_q);
1935 				goto err_kill_estimator;
1936 			}
1937 			_bstats_update(&parent->bstats_bias,
1938 				       u64_stats_read(&old_q->bstats.bytes),
1939 				       u64_stats_read(&old_q->bstats.packets));
1940 			qdisc_put(old_q);
1941 		}
1942 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1943 					  classid, NULL);
1944 		if (q->offload) {
1945 			if (new_q) {
1946 				htb_set_lockdep_class_child(new_q);
1947 				/* One ref for cl->leaf.q, the other for
1948 				 * dev_queue->qdisc.
1949 				 */
1950 				qdisc_refcount_inc(new_q);
1951 			}
1952 			old_q = htb_graft_helper(dev_queue, new_q);
1953 			/* No qdisc_put needed. */
1954 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1955 		}
1956 		sch_tree_lock(sch);
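		/* If the parent was a leaf until now, convert it to an inner
		 * node under the tree lock; its displaced leaf qdisc is only
		 * stashed in parent_qdisc here and released with qdisc_put()
		 * after sch_tree_unlock() below.
		 */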
1957 		if (parent && !parent->level) {
1958 			/* turn parent into inner node */
1959 			qdisc_purge_queue(parent->leaf.q);
1960 			parent_qdisc = parent->leaf.q;
1961 			if (parent->prio_activity)
1962 				htb_deactivate(q, parent);
1963 
1964 			/* remove from the event (wait) queue because of the level change */
1965 			if (parent->cmode != HTB_CAN_SEND) {
1966 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1967 				parent->cmode = HTB_CAN_SEND;
1968 			}
1969 			parent->level = (parent->parent ? parent->parent->level
1970 					 : TC_HTB_MAXDEPTH) - 1;
1971 			memset(&parent->inner, 0, sizeof(parent->inner));
1972 		}
1973 
1974 		/* the leaf (this new class) needs an elementary qdisc */
1975 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1976 		if (q->offload)
1977 			cl->leaf.offload_queue = dev_queue;
1978 
1979 		cl->parent = parent;
1980 
1981 		/* set class to be in HTB_CAN_SEND state */
1982 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1983 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1984 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1985 		cl->t_c = ktime_get_ns();
1986 		cl->cmode = HTB_CAN_SEND;
1987 
1988 		/* attach to the hash list and parent's family */
1989 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1990 		if (parent)
1991 			parent->children++;
1992 		if (cl->leaf.q != &noop_qdisc)
1993 			qdisc_hash_add(cl->leaf.q, true);
1994 	} else {
1995 		if (tca[TCA_RATE]) {
1996 			err = gen_replace_estimator(&cl->bstats, NULL,
1997 						    &cl->rate_est,
1998 						    NULL,
1999 						    true,
2000 						    tca[TCA_RATE]);
2001 			if (err)
2002 				return err;
2003 		}
2004 
2005 		if (q->offload) {
2006 			struct net_device *dev = qdisc_dev(sch);
2007 
2008 			offload_opt = (struct tc_htb_qopt_offload) {
2009 				.command = TC_HTB_NODE_MODIFY,
2010 				.classid = cl->common.classid,
2011 				.rate = max_t(u64, hopt->rate.rate, rate64),
2012 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2013 				.extack = extack,
2014 			};
2015 			err = htb_offload(dev, &offload_opt);
2016 			if (err)
2017 				/* Estimator was replaced, and rollback may fail
2018 				 * as well, so we don't try to recover it, and
2019 				 * the estimator won't work properly with the
2020 				 * offload anyway, because bstats are updated
2021 				 * only when the stats are queried.
2022 				 */
2023 				return err;
2024 		}
2025 
2026 		sch_tree_lock(sch);
2027 	}
2028 
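	/* Precompute the mult/shift form of rate and ceil so the fast path can
	 * convert packet lengths straight into token consumption in
	 * nanoseconds.
	 */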
2029 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2030 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2031 
2032 	/* There used to be a nasty bug here: we have to check that the node
2033 	 * really is a leaf before changing cl->leaf!
2034 	 */
2035 	if (!cl->level) {
2036 		u64 quantum = cl->rate.rate_bytes_ps;
2037 
2038 		do_div(quantum, q->rate2quantum);
2039 		cl->quantum = min_t(u64, quantum, INT_MAX);
2040 
2041 		if (!hopt->quantum && cl->quantum < 1000) {
2042 			warn = -1;
2043 			cl->quantum = 1000;
2044 		}
2045 		if (!hopt->quantum && cl->quantum > 200000) {
2046 			warn = 1;
2047 			cl->quantum = 200000;
2048 		}
2049 		if (hopt->quantum)
2050 			cl->quantum = hopt->quantum;
2051 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2052 			cl->prio = TC_HTB_NUMPRIO - 1;
2053 	}
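	/* Worked example (assuming iproute2's usual default of r2q = 10):
	 * a 100 Mbit/s class has rate_bytes_ps = 12,500,000, so the derived
	 * quantum is 1,250,000 bytes; that exceeds 200000, gets clamped, and
	 * triggers the "Consider r2q change" warning below. An r2q of at least
	 * 63 keeps such a rate within the [1000, 200000] range.
	 */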
2054 
2055 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2056 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2057 
2058 	sch_tree_unlock(sch);
2059 	qdisc_put(parent_qdisc);
2060 
2061 	if (warn)
2062 		pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
2063 			    cl->common.classid, (warn == -1 ? "small" : "big"));
2064 
2065 	qdisc_class_hash_grow(sch, &q->clhash);
2066 
2067 	*arg = (unsigned long)cl;
2068 	return 0;
2069 
2070 err_kill_estimator:
2071 	gen_kill_estimator(&cl->rate_est);
2072 err_block_put:
2073 	tcf_block_put(cl->block);
2074 	kfree(cl);
2075 failure:
2076 	return err;
2077 }
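/* Illustrative usage (placeholder device, typical iproute2 syntax): the class
 * creation path above is exercised by commands along the lines of
 *	tc qdisc add dev eth0 root handle 1: htb
 *	tc class add dev eth0 parent 1: classid 1:1 htb rate 100mbit
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 30mbit ceil 100mbit
 */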
2078 
2079 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2080 				       struct netlink_ext_ack *extack)
2081 {
2082 	struct htb_sched *q = qdisc_priv(sch);
2083 	struct htb_class *cl = (struct htb_class *)arg;
2084 
2085 	return cl ? cl->block : q->block;
2086 }
2087 
2088 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2089 				     u32 classid)
2090 {
2091 	struct htb_class *cl = htb_find(classid, sch);
2092 
2093 	/*if (cl && !cl->level) return 0;
2094 	 * The line above used to be there to prevent attaching filters to
2095 	 * leaves. But at least the tc_index filter uses this just to look up
2096 	 * the class for other reasons, so we have to allow it.
2097 	 * ----
2098 	 * 19.6.2002 As Werner explained, it is OK - binding a filter is just
2099 	 * another way to "lock" the class - unlike "get", this lock can be
2100 	 * broken by the class during destroy, IIUC.
2101 	 */
2102 	if (cl)
2103 		cl->filter_cnt++;
2104 	return (unsigned long)cl;
2105 }
2106 
2107 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2108 {
2109 	struct htb_class *cl = (struct htb_class *)arg;
2110 
2111 	if (cl)
2112 		cl->filter_cnt--;
2113 }
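/* filter_cnt pairs the bind/unbind operations above: it counts how many tc
 * filters currently reference the class, i.e. the "lock" mentioned in
 * htb_bind_filter().
 */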
2114 
2115 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2116 {
2117 	struct htb_sched *q = qdisc_priv(sch);
2118 	struct htb_class *cl;
2119 	unsigned int i;
2120 
2121 	if (arg->stop)
2122 		return;
2123 
2124 	for (i = 0; i < q->clhash.hashsize; i++) {
2125 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2126 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2127 				return;
2128 		}
2129 	}
2130 }
2131 
2132 static const struct Qdisc_class_ops htb_class_ops = {
2133 	.select_queue	=	htb_select_queue,
2134 	.graft		=	htb_graft,
2135 	.leaf		=	htb_leaf,
2136 	.qlen_notify	=	htb_qlen_notify,
2137 	.find		=	htb_search,
2138 	.change		=	htb_change_class,
2139 	.delete		=	htb_delete,
2140 	.walk		=	htb_walk,
2141 	.tcf_block	=	htb_tcf_block,
2142 	.bind_tcf	=	htb_bind_filter,
2143 	.unbind_tcf	=	htb_unbind_filter,
2144 	.dump		=	htb_dump_class,
2145 	.dump_stats	=	htb_dump_class_stats,
2146 };
2147 
2148 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2149 	.cl_ops		=	&htb_class_ops,
2150 	.id		=	"htb",
2151 	.priv_size	=	sizeof(struct htb_sched),
2152 	.enqueue	=	htb_enqueue,
2153 	.dequeue	=	htb_dequeue,
2154 	.peek		=	qdisc_peek_dequeued,
2155 	.init		=	htb_init,
2156 	.attach		=	htb_attach,
2157 	.reset		=	htb_reset,
2158 	.destroy	=	htb_destroy,
2159 	.dump		=	htb_dump,
2160 	.owner		=	THIS_MODULE,
2161 };
2162 
2163 static int __init htb_module_init(void)
2164 {
2165 	return register_qdisc(&htb_qdisc_ops);
2166 }
2167 static void __exit htb_module_exit(void)
2168 {
2169 	unregister_qdisc(&htb_qdisc_ops);
2170 }
2171 
2172 module_init(htb_module_init)
2173 module_exit(htb_module_exit)
2174 MODULE_LICENSE("GPL");
2175