xref: /openbmc/linux/net/sched/sch_htb.c (revision 63186a89)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
    Each class is assigned a level. A leaf ALWAYS has level 0 and root
    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
    one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
/* Used internally to keep the status of a single class. The numeric order
 * matters: a higher value means the class is allowed to do more.
 */
enum htb_cmode {
	/* class can't send and can't borrow */
	HTB_CANT_SEND = 0,
	/* class can't send but may borrow */
	HTB_MAY_BORROW = 1,
	/* class can send */
	HTB_CAN_SEND = 2
};
75 
/* One priority slot: a classid-ordered tree of classes plus the round-robin
 * cursor into it. The same layout serves two roles, hence the union:
 * "row" when embedded in struct htb_level, "feed" when embedded in an
 * inner class.
 */
struct htb_prio {
	union {
		struct rb_root	row;
		struct rb_root	feed;
	};
	struct rb_node	*ptr;	/* current position in row/feed; may be NULL */
	/* When class changes from state 1->2 and disconnects from
	 * parent's feed then we lose the ptr value and start from the
	 * first child again. Here we store classid of the
	 * last valid ptr (used when ptr is NULL).
	 */
	u32		last_ptr_id;
};
89 
/* interior & leaf nodes; members used by only one kind of node live in the
 * leaf/inner union below.
 * To reduce false sharing, place mostly read fields at beginning,
 * and mostly written ones at the end.
 */
struct htb_class {
	struct Qdisc_class_common common;
	struct psched_ratecfg	rate;
	struct psched_ratecfg	ceil;
	s64			buffer, cbuffer;/* upper bounds for tokens / ctokens */
	s64			mbuffer;	/* max wait time */
	u32			prio;		/* these two are used only by leaves... */
	int			quantum;	/* but stored for parent-to-leaf return */

	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
	struct tcf_block	*block;
	int			filter_cnt;

	int			level;		/* our level (see above) */
	unsigned int		children;
	struct htb_class	*parent;	/* parent class */

	struct net_rate_estimator __rcu *rate_est;

	/*
	 * Written often fields
	 */
	struct gnet_stats_basic_sync bstats;
	struct gnet_stats_basic_sync bstats_bias;
	struct tc_htb_xstats	xstats;	/* our special stats */

	/* token bucket parameters */
	s64			tokens, ctokens;/* current number of tokens */
	s64			t_c;		/* checkpoint time */

	union {
		/* valid while level == 0 */
		struct htb_class_leaf {
			int		deficit[TC_HTB_MAXDEPTH];
			struct Qdisc	*q;
			struct netdev_queue *offload_queue;
		} leaf;
		/* valid while level != 0: one feed per priority */
		struct htb_class_inner {
			struct htb_prio clprio[TC_HTB_NUMPRIO];
		} inner;
	};
	s64			pq_key;		/* time of the pending mode-change event */

	int			prio_activity;	/* for which prios are we active */
	enum htb_cmode		cmode;		/* current mode of the class */
	struct rb_node		pq_node;	/* node for event queue */
	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */

	/* counters bumped from the enqueue / mode-change paths */
	unsigned int drops ____cacheline_aligned_in_smp;
	unsigned int		overlimits;
};
144 
/* Per-level scheduler state: the event queue of classes waiting for a mode
 * change (linked through htb_class.pq_node) and one row per priority.
 */
struct htb_level {
	struct rb_root	wait_pq;
	struct htb_prio hprio[TC_HTB_NUMPRIO];
};
149 
/* Private data of one HTB qdisc instance (obtained via qdisc_priv()). */
struct htb_sched {
	struct Qdisc_class_hash clhash;	/* all classes, looked up by htb_find() */
	int			defcls;		/* class where unclassified flows go to */
	int			rate2quantum;	/* quant = rate / rate2quantum */

	/* filters for qdisc itself */
	struct tcf_proto __rcu	*filter_list;
	struct tcf_block	*block;

#define HTB_WARN_TOOMANYEVENTS	0x1
	unsigned int		warned;	/* only one warning */
	int			direct_qlen;	/* length limit of direct_queue */
	struct work_struct	work;	/* runs htb_work_func() */

	/* non shaped skbs; let them go directly thru */
	struct qdisc_skb_head	direct_queue;
	u32			direct_pkts;
	u32			overlimits;

	struct qdisc_watchdog	watchdog;

	s64			now;	/* cached dequeue time */

	/* time of nearest event per level (row) */
	s64			near_ev_cache[TC_HTB_MAXDEPTH];

	/* per level: bitmap of priorities whose row is in use */
	int			row_mask[TC_HTB_MAXDEPTH];

	struct htb_level	hlevel[TC_HTB_MAXDEPTH];

	struct Qdisc		**direct_qdiscs;
	unsigned int            num_direct_qdiscs;

	bool			offload;	/* when set, htb_reset() leaves leaf qdiscs alone */
};
185 
186 /* find class in global hash table using given handle */
187 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 {
189 	struct htb_sched *q = qdisc_priv(sch);
190 	struct Qdisc_class_common *clc;
191 
192 	clc = qdisc_class_find(&q->clhash, handle);
193 	if (clc == NULL)
194 		return NULL;
195 	return container_of(clc, struct htb_class, common);
196 }
197 
198 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 {
200 	return (unsigned long)htb_find(handle, sch);
201 }
202 
/* Sentinel returned by htb_classify() for packets that go to the direct
 * queue; it is not a real class and must not be dereferenced.
 */
#define HTB_DIRECT ((struct htb_class *)-1L)
204 
/**
 * htb_classify - classify a packet into class
 * @skb: the socket buffer
 * @sch: the active queue discipline
 * @qerr: pointer for returned status code
 *
 * It returns NULL if the packet should be dropped or HTB_DIRECT (-1) if the
 * packet should be passed directly thru. In all other cases a leaf class is
 * returned.
 * We allow direct class selection by classid in skb->priority. Then we examine
 * filters in qdisc and in inner nodes (if higher filter points to the inner
 * node). If we end up with classid MAJOR:0 we enqueue the skb into special
 * internal fifo (direct). These packets then go directly thru. If we still
 * have no valid leaf we try to use MAJOR:default leaf. If that is still
 * unsuccessful we finish and return the direct queue.
 *
 * Note that *@qerr is written only once filter classification starts; the
 * early returns taken on skb->priority leave it untouched. NULL is returned
 * only from the CONFIG_NET_CLS_ACT action handling.
 */
static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl;
	struct tcf_result res;
	struct tcf_proto *tcf;
	int result;

	/* allow to select class by setting skb->priority to valid classid;
	 * note that nfmark can be used too by attaching filter fw with no
	 * rules in it
	 */
	if (skb->priority == sch->handle)
		return HTB_DIRECT;	/* X:0 (direct flow) selected */
	cl = htb_find(skb->priority, sch);
	if (cl) {
		if (cl->level == 0)
			return cl;
		/* Start with inner filter chain if a non-leaf class is selected */
		tcf = rcu_dereference_bh(cl->filter_list);
	} else {
		tcf = rcu_dereference_bh(q->filter_list);
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	/* walk down the hierarchy, one filter chain per visited class */
	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return NULL;
		}
#endif
		/* prefer the class pointer cached in the result, else look the
		 * classid up
		 */
		cl = (void *)res.class;
		if (!cl) {
			if (res.classid == sch->handle)
				return HTB_DIRECT;	/* X:0 (direct flow) */
			cl = htb_find(res.classid, sch);
			if (!cl)
				break;	/* filter selected invalid classid */
		}
		if (!cl->level)
			return cl;	/* we hit leaf; return it */

		/* we have got inner class; apply inner filter chain */
		tcf = rcu_dereference_bh(cl->filter_list);
	}
	/* classification failed; try to use default class */
	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
	if (!cl || cl->level)
		return HTB_DIRECT;	/* bad default .. this is safe bet */
	return cl;
}
278 
279 /**
280  * htb_add_to_id_tree - adds class to the round robin list
281  * @root: the root of the tree
282  * @cl: the class to add
283  * @prio: the give prio in class
284  *
285  * Routine adds class to the list (actually tree) sorted by classid.
286  * Make sure that class is not already on such list for given prio.
287  */
288 static void htb_add_to_id_tree(struct rb_root *root,
289 			       struct htb_class *cl, int prio)
290 {
291 	struct rb_node **p = &root->rb_node, *parent = NULL;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, node[prio]);
297 
298 		if (cl->common.classid > c->common.classid)
299 			p = &parent->rb_right;
300 		else
301 			p = &parent->rb_left;
302 	}
303 	rb_link_node(&cl->node[prio], parent, p);
304 	rb_insert_color(&cl->node[prio], root);
305 }
306 
307 /**
308  * htb_add_to_wait_tree - adds class to the event queue with delay
309  * @q: the priority event queue
310  * @cl: the class to add
311  * @delay: delay in microseconds
312  *
313  * The class is added to priority event queue to indicate that class will
314  * change its mode in cl->pq_key microseconds. Make sure that class is not
315  * already in the queue.
316  */
317 static void htb_add_to_wait_tree(struct htb_sched *q,
318 				 struct htb_class *cl, s64 delay)
319 {
320 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
321 
322 	cl->pq_key = q->now + delay;
323 	if (cl->pq_key == q->now)
324 		cl->pq_key++;
325 
326 	/* update the nearest event cache */
327 	if (q->near_ev_cache[cl->level] > cl->pq_key)
328 		q->near_ev_cache[cl->level] = cl->pq_key;
329 
330 	while (*p) {
331 		struct htb_class *c;
332 		parent = *p;
333 		c = rb_entry(parent, struct htb_class, pq_node);
334 		if (cl->pq_key >= c->pq_key)
335 			p = &parent->rb_right;
336 		else
337 			p = &parent->rb_left;
338 	}
339 	rb_link_node(&cl->pq_node, parent, p);
340 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
341 }
342 
/**
 * htb_next_rb_node - advance a tree cursor to its in-order successor
 * @n: address of the cursor (pointer to the current node in binary tree)
 *
 * Replaces *@n with rb_next(*@n); when we are past the last key *@n
 * becomes NULL. Nothing is returned.
 *
 * A cursor that is already NULL is left untouched. rb_next() inspects the
 * node it is given, so it must not be handed NULL, and feed cursors are
 * legitimately reset to NULL by htb_deactivate_prios() when the class they
 * point at is unlinked.
 * Average complexity is 2 steps per call.
 */
static inline void htb_next_rb_node(struct rb_node **n)
{
	if (*n)
		*n = rb_next(*n);
}
354 
355 /**
356  * htb_add_class_to_row - add class to its row
357  * @q: the priority event queue
358  * @cl: the class to add
359  * @mask: the given priorities in class in bitmap
360  *
361  * The class is added to row at priorities marked in mask.
362  * It does nothing if mask == 0.
363  */
364 static inline void htb_add_class_to_row(struct htb_sched *q,
365 					struct htb_class *cl, int mask)
366 {
367 	q->row_mask[cl->level] |= mask;
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		mask &= ~(1 << prio);
371 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
372 	}
373 }
374 
/* Erase @rb from @root and mark it as unlinked. Being asked to erase a node
 * that is not linked is a bug in this code, but it need not be fatal: warn
 * and leave the tree alone.
 */
static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(rb)))
		return;

	rb_erase(rb, root);
	RB_CLEAR_NODE(rb);
}
385 
386 
387 /**
388  * htb_remove_class_from_row - removes class from its row
389  * @q: the priority event queue
390  * @cl: the class to add
391  * @mask: the given priorities in class in bitmap
392  *
393  * The class is removed from row at priorities marked in mask.
394  * It does nothing if mask == 0.
395  */
396 static inline void htb_remove_class_from_row(struct htb_sched *q,
397 						 struct htb_class *cl, int mask)
398 {
399 	int m = 0;
400 	struct htb_level *hlevel = &q->hlevel[cl->level];
401 
402 	while (mask) {
403 		int prio = ffz(~mask);
404 		struct htb_prio *hprio = &hlevel->hprio[prio];
405 
406 		mask &= ~(1 << prio);
407 		if (hprio->ptr == cl->node + prio)
408 			htb_next_rb_node(&hprio->ptr);
409 
410 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
411 		if (!hprio->row.rb_node)
412 			m |= 1 << prio;
413 	}
414 	q->row_mask[cl->level] &= ~m;
415 }
416 
417 /**
418  * htb_activate_prios - creates active classe's feed chain
419  * @q: the priority event queue
420  * @cl: the class to activate
421  *
422  * The class is connected to ancestors and/or appropriate rows
423  * for priorities it is participating on. cl->cmode must be new
424  * (activated) mode. It does nothing if cl->prio_activity == 0.
425  */
426 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
427 {
428 	struct htb_class *p = cl->parent;
429 	long m, mask = cl->prio_activity;
430 
431 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
432 		m = mask;
433 		while (m) {
434 			unsigned int prio = ffz(~m);
435 
436 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
437 				break;
438 			m &= ~(1 << prio);
439 
440 			if (p->inner.clprio[prio].feed.rb_node)
441 				/* parent already has its feed in use so that
442 				 * reset bit in mask as parent is already ok
443 				 */
444 				mask &= ~(1 << prio);
445 
446 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
447 		}
448 		p->prio_activity |= mask;
449 		cl = p;
450 		p = cl->parent;
451 
452 	}
453 	if (cl->cmode == HTB_CAN_SEND && mask)
454 		htb_add_class_to_row(q, cl, mask);
455 }
456 
457 /**
458  * htb_deactivate_prios - remove class from feed chain
459  * @q: the priority event queue
460  * @cl: the class to deactivate
461  *
462  * cl->cmode must represent old mode (before deactivation). It does
463  * nothing if cl->prio_activity == 0. Class is removed from all feed
464  * chains and rows.
465  */
466 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
467 {
468 	struct htb_class *p = cl->parent;
469 	long m, mask = cl->prio_activity;
470 
471 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
472 		m = mask;
473 		mask = 0;
474 		while (m) {
475 			int prio = ffz(~m);
476 			m &= ~(1 << prio);
477 
478 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
479 				/* we are removing child which is pointed to from
480 				 * parent feed - forget the pointer but remember
481 				 * classid
482 				 */
483 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
484 				p->inner.clprio[prio].ptr = NULL;
485 			}
486 
487 			htb_safe_rb_erase(cl->node + prio,
488 					  &p->inner.clprio[prio].feed);
489 
490 			if (!p->inner.clprio[prio].feed.rb_node)
491 				mask |= 1 << prio;
492 		}
493 
494 		p->prio_activity &= ~mask;
495 		cl = p;
496 		p = cl->parent;
497 
498 	}
499 	if (cl->cmode == HTB_CAN_SEND && mask)
500 		htb_remove_class_from_row(q, cl, mask);
501 }
502 
503 static inline s64 htb_lowater(const struct htb_class *cl)
504 {
505 	if (htb_hysteresis)
506 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
507 	else
508 		return 0;
509 }
510 static inline s64 htb_hiwater(const struct htb_class *cl)
511 {
512 	if (htb_hysteresis)
513 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
514 	else
515 		return 0;
516 }
517 
518 
519 /**
520  * htb_class_mode - computes and returns current class mode
521  * @cl: the target class
522  * @diff: diff time in microseconds
523  *
524  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
525  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
526  * from now to time when cl will change its state.
527  * Also it is worth to note that class mode doesn't change simply
528  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
529  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
530  * mode transitions per time unit. The speed gain is about 1/6.
531  */
532 static inline enum htb_cmode
533 htb_class_mode(struct htb_class *cl, s64 *diff)
534 {
535 	s64 toks;
536 
537 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
538 		*diff = -toks;
539 		return HTB_CANT_SEND;
540 	}
541 
542 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
543 		return HTB_CAN_SEND;
544 
545 	*diff = -toks;
546 	return HTB_MAY_BORROW;
547 }
548 
549 /**
550  * htb_change_class_mode - changes classe's mode
551  * @q: the priority event queue
552  * @cl: the target class
553  * @diff: diff time in microseconds
554  *
555  * This should be the only way how to change classe's mode under normal
556  * circumstances. Routine will update feed lists linkage, change mode
557  * and add class to the wait event queue if appropriate. New mode should
558  * be different from old one and cl->pq_key has to be valid if changing
559  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
560  */
561 static void
562 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
563 {
564 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
565 
566 	if (new_mode == cl->cmode)
567 		return;
568 
569 	if (new_mode == HTB_CANT_SEND) {
570 		cl->overlimits++;
571 		q->overlimits++;
572 	}
573 
574 	if (cl->prio_activity) {	/* not necessary: speed optimization */
575 		if (cl->cmode != HTB_CANT_SEND)
576 			htb_deactivate_prios(q, cl);
577 		cl->cmode = new_mode;
578 		if (new_mode != HTB_CANT_SEND)
579 			htb_activate_prios(q, cl);
580 	} else
581 		cl->cmode = new_mode;
582 }
583 
584 /**
585  * htb_activate - inserts leaf cl into appropriate active feeds
586  * @q: the priority event queue
587  * @cl: the target class
588  *
589  * Routine learns (new) priority of leaf and activates feed chain
590  * for the prio. It can be called on already active leaf safely.
591  * It also adds leaf into droplist.
592  */
593 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
594 {
595 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
596 
597 	if (!cl->prio_activity) {
598 		cl->prio_activity = 1 << cl->prio;
599 		htb_activate_prios(q, cl);
600 	}
601 }
602 
603 /**
604  * htb_deactivate - remove leaf cl from active feeds
605  * @q: the priority event queue
606  * @cl: the target class
607  *
608  * Make sure that leaf is active. In the other words it can't be called
609  * with non-active leaf. It also removes class from the drop list.
610  */
611 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
612 {
613 	WARN_ON(!cl->prio_activity);
614 
615 	htb_deactivate_prios(q, cl);
616 	cl->prio_activity = 0;
617 }
618 
619 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
620 		       struct sk_buff **to_free)
621 {
622 	int ret;
623 	unsigned int len = qdisc_pkt_len(skb);
624 	struct htb_sched *q = qdisc_priv(sch);
625 	struct htb_class *cl = htb_classify(skb, sch, &ret);
626 
627 	if (cl == HTB_DIRECT) {
628 		/* enqueue to helper queue */
629 		if (q->direct_queue.qlen < q->direct_qlen) {
630 			__qdisc_enqueue_tail(skb, &q->direct_queue);
631 			q->direct_pkts++;
632 		} else {
633 			return qdisc_drop(skb, sch, to_free);
634 		}
635 #ifdef CONFIG_NET_CLS_ACT
636 	} else if (!cl) {
637 		if (ret & __NET_XMIT_BYPASS)
638 			qdisc_qstats_drop(sch);
639 		__qdisc_drop(skb, to_free);
640 		return ret;
641 #endif
642 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
643 					to_free)) != NET_XMIT_SUCCESS) {
644 		if (net_xmit_drop_count(ret)) {
645 			qdisc_qstats_drop(sch);
646 			cl->drops++;
647 		}
648 		return ret;
649 	} else {
650 		htb_activate(q, cl);
651 	}
652 
653 	sch->qstats.backlog += len;
654 	sch->q.qlen++;
655 	return NET_XMIT_SUCCESS;
656 }
657 
658 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
659 {
660 	s64 toks = diff + cl->tokens;
661 
662 	if (toks > cl->buffer)
663 		toks = cl->buffer;
664 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
665 	if (toks <= -cl->mbuffer)
666 		toks = 1 - cl->mbuffer;
667 
668 	cl->tokens = toks;
669 }
670 
671 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
672 {
673 	s64 toks = diff + cl->ctokens;
674 
675 	if (toks > cl->cbuffer)
676 		toks = cl->cbuffer;
677 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
678 	if (toks <= -cl->mbuffer)
679 		toks = 1 - cl->mbuffer;
680 
681 	cl->ctokens = toks;
682 }
683 
684 /**
685  * htb_charge_class - charges amount "bytes" to leaf and ancestors
686  * @q: the priority event queue
687  * @cl: the class to start iterate
688  * @level: the minimum level to account
689  * @skb: the socket buffer
690  *
691  * Routine assumes that packet "bytes" long was dequeued from leaf cl
692  * borrowing from "level". It accounts bytes to ceil leaky bucket for
693  * leaf and all ancestors and to rate bucket for ancestors at levels
694  * "level" and higher. It also handles possible change of mode resulting
695  * from the update. Note that mode can also increase here (MAY_BORROW to
696  * CAN_SEND) because we can use more precise clock that event queue here.
697  * In such case we remove class from event queue first.
698  */
699 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
700 			     int level, struct sk_buff *skb)
701 {
702 	int bytes = qdisc_pkt_len(skb);
703 	enum htb_cmode old_mode;
704 	s64 diff;
705 
706 	while (cl) {
707 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
708 		if (cl->level >= level) {
709 			if (cl->level == level)
710 				cl->xstats.lends++;
711 			htb_accnt_tokens(cl, bytes, diff);
712 		} else {
713 			cl->xstats.borrows++;
714 			cl->tokens += diff;	/* we moved t_c; update tokens */
715 		}
716 		htb_accnt_ctokens(cl, bytes, diff);
717 		cl->t_c = q->now;
718 
719 		old_mode = cl->cmode;
720 		diff = 0;
721 		htb_change_class_mode(q, cl, &diff);
722 		if (old_mode != cl->cmode) {
723 			if (old_mode != HTB_CAN_SEND)
724 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
725 			if (cl->cmode != HTB_CAN_SEND)
726 				htb_add_to_wait_tree(q, cl, diff);
727 		}
728 
729 		/* update basic stats except for leaves which are already updated */
730 		if (cl->level)
731 			bstats_update(&cl->bstats, skb);
732 
733 		cl = cl->parent;
734 	}
735 }
736 
737 /**
738  * htb_do_events - make mode changes to classes at the level
739  * @q: the priority event queue
740  * @level: which wait_pq in 'q->hlevel'
741  * @start: start jiffies
742  *
743  * Scans event queue for pending events and applies them. Returns time of
744  * next pending event (0 for no event in pq, q->now for too many events).
745  * Note: Applied are events whose have cl->pq_key <= q->now.
746  */
747 static s64 htb_do_events(struct htb_sched *q, const int level,
748 			 unsigned long start)
749 {
750 	/* don't run for longer than 2 jiffies; 2 is used instead of
751 	 * 1 to simplify things when jiffy is going to be incremented
752 	 * too soon
753 	 */
754 	unsigned long stop_at = start + 2;
755 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
756 
757 	while (time_before(jiffies, stop_at)) {
758 		struct htb_class *cl;
759 		s64 diff;
760 		struct rb_node *p = rb_first(wait_pq);
761 
762 		if (!p)
763 			return 0;
764 
765 		cl = rb_entry(p, struct htb_class, pq_node);
766 		if (cl->pq_key > q->now)
767 			return cl->pq_key;
768 
769 		htb_safe_rb_erase(p, wait_pq);
770 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
771 		htb_change_class_mode(q, cl, &diff);
772 		if (cl->cmode != HTB_CAN_SEND)
773 			htb_add_to_wait_tree(q, cl, diff);
774 	}
775 
776 	/* too much load - let's continue after a break for scheduling */
777 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
778 		pr_warn("htb: too many events!\n");
779 		q->warned |= HTB_WARN_TOOMANYEVENTS;
780 	}
781 
782 	return q->now;
783 }
784 
785 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
786  * is no such one exists.
787  */
788 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
789 					      u32 id)
790 {
791 	struct rb_node *r = NULL;
792 	while (n) {
793 		struct htb_class *cl =
794 		    rb_entry(n, struct htb_class, node[prio]);
795 
796 		if (id > cl->common.classid) {
797 			n = n->rb_right;
798 		} else if (id < cl->common.classid) {
799 			r = n;
800 			n = n->rb_left;
801 		} else {
802 			return n;
803 		}
804 	}
805 	return r;
806 }
807 
/**
 * htb_lookup_leaf - returns next leaf class in DRR order
 * @hprio: the current one
 * @prio: which prio in class
 *
 * Find leaf where current feed pointers points to. The walk starts at the
 * row in @hprio and descends through the feeds of inner classes, using a
 * small explicit stack with one entry per visited tree.
 *
 * Returns the leaf, or NULL (after a warning) if the walk runs into an
 * inconsistent cursor or does not terminate within 65535 steps.
 */
static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
{
	int i;
	/* one frame per tree: its root, its cursor and its saved classid hint */
	struct {
		struct rb_node *root;
		struct rb_node **pptr;
		u32 *pid;
	} stk[TC_HTB_MAXDEPTH], *sp = stk;

	BUG_ON(!hprio->row.rb_node);
	sp->root = hprio->row.rb_node;
	sp->pptr = &hprio->ptr;
	sp->pid = &hprio->last_ptr_id;

	for (i = 0; i < 65535; i++) {
		if (!*sp->pptr && *sp->pid) {
			/* ptr was invalidated but id is valid - try to recover
			 * the original or next ptr
			 */
			*sp->pptr =
			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
		}
		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
				 * can become out of date quickly
				 */
		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
			*sp->pptr = sp->root;
			while ((*sp->pptr)->rb_left)
				*sp->pptr = (*sp->pptr)->rb_left;
			if (sp > stk) {
				/* this feed wrapped around: move the parent's
				 * cursor on to its next child
				 */
				sp--;
				if (!*sp->pptr) {
					WARN_ON(1);
					return NULL;
				}
				htb_next_rb_node(sp->pptr);
			}
		} else {
			struct htb_class *cl;
			struct htb_prio *clp;

			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
			if (!cl->level)
				return cl;
			/* inner class: push its feed and continue there */
			clp = &cl->inner.clprio[prio];
			(++sp)->root = clp->feed.rb_node;
			sp->pptr = &clp->ptr;
			sp->pid = &clp->last_ptr_id;
		}
	}
	WARN_ON(1);
	return NULL;
}
868 
/* dequeues packet at given priority and level; call only if
 * you are sure that there is active class at prio/level
 *
 * Returns the dequeued skb, or NULL if no leaf reachable from the row had
 * a packet to give. On success the leaf's DRR deficit is updated, an
 * emptied leaf is deactivated and the packet is charged to the class and
 * its ancestors.
 */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
					const int level)
{
	struct sk_buff *skb = NULL;
	struct htb_class *cl, *start;
	struct htb_level *hlevel = &q->hlevel[level];
	struct htb_prio *hprio = &hlevel->hprio[prio];

	/* look initial class up in the row */
	start = cl = htb_lookup_leaf(hprio, prio);

	do {
next:
		if (unlikely(!cl))
			return NULL;

		/* class can be empty - it is unlikely but can be true if leaf
		 * qdisc drops packets in enqueue routine or if someone used
		 * graft operation on the leaf since last dequeue;
		 * simply deactivate and skip such class
		 */
		if (unlikely(cl->leaf.q->q.qlen == 0)) {
			struct htb_class *next;
			htb_deactivate(q, cl);

			/* row/level might become empty */
			if ((q->row_mask[level] & (1 << prio)) == 0)
				return NULL;

			next = htb_lookup_leaf(hprio, prio);

			if (cl == start)	/* fix start if we just deleted it */
				start = next;
			cl = next;
			goto next;
		}

		skb = cl->leaf.q->dequeue(cl->leaf.q);
		if (likely(skb != NULL))
			break;

		/* leaf has packets but gave none: warn, then try the next
		 * class in round robin order
		 */
		qdisc_warn_nonwc("htb", cl->leaf.q);
		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
					 &q->hlevel[0].hprio[prio].ptr);
		cl = htb_lookup_leaf(hprio, prio);

	} while (cl != start);

	if (likely(skb != NULL)) {
		bstats_update(&cl->bstats, skb);
		/* DRR: once the deficit is used up, refill it by one quantum
		 * and move the cursor on to the next class
		 */
		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
		if (cl->leaf.deficit[level] < 0) {
			cl->leaf.deficit[level] += cl->quantum;
			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
						 &q->hlevel[0].hprio[prio].ptr);
		}
		/* this used to be after charge_class but this constellation
		 * gives us slightly better performance
		 */
		if (!cl->leaf.q->q.qlen)
			htb_deactivate(q, cl);
		htb_charge_class(q, cl, level, skb);
	}
	return skb;
}
937 
/* Dequeue one packet from the qdisc.
 *
 * Packets in the direct queue are served first. Otherwise the levels are
 * visited from 0 upwards: pending mode-change events of the level are
 * applied when due, then its active priorities are tried in ascending
 * order. If nothing can be sent although packets are queued, either the
 * watchdog is armed for the nearest event or, when that event is not in the
 * future, the work item is scheduled instead.
 *
 * Returns the skb, or NULL if nothing can be sent right now.
 */
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct htb_sched *q = qdisc_priv(sch);
	int level;
	s64 next_event;
	unsigned long start_at;

	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
	skb = __qdisc_dequeue_head(&q->direct_queue);
	if (skb != NULL) {
ok:
		/* common exit for every successfully dequeued packet */
		qdisc_bstats_update(sch, skb);
		qdisc_qstats_backlog_dec(sch, skb);
		sch->q.qlen--;
		return skb;
	}

	if (!sch->q.qlen)
		goto fin;
	q->now = ktime_get_ns();
	start_at = jiffies;

	/* upper bound for the next wake-up; lowered by the levels below */
	next_event = q->now + 5LLU * NSEC_PER_SEC;

	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
		/* common case optimization - skip event handler quickly */
		int m;
		s64 event = q->near_ev_cache[level];

		if (q->now >= event) {
			event = htb_do_events(q, level, start_at);
			/* no event pending at this level: look again later */
			if (!event)
				event = q->now + NSEC_PER_SEC;
			q->near_ev_cache[level] = event;
		}

		if (next_event > event)
			next_event = event;

		/* m has a zero bit for every active prio; each one is set
		 * again once that prio has been tried
		 */
		m = ~q->row_mask[level];
		while (m != (int)(-1)) {
			int prio = ffz(m);

			m |= 1 << prio;
			skb = htb_dequeue_tree(q, prio, level);
			if (likely(skb != NULL))
				goto ok;
		}
	}
	if (likely(next_event > q->now))
		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
	else
		schedule_work(&q->work);
fin:
	return skb;
}
995 
996 /* reset all classes */
997 /* always caled under BH & queue lock */
998 static void htb_reset(struct Qdisc *sch)
999 {
1000 	struct htb_sched *q = qdisc_priv(sch);
1001 	struct htb_class *cl;
1002 	unsigned int i;
1003 
1004 	for (i = 0; i < q->clhash.hashsize; i++) {
1005 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1006 			if (cl->level)
1007 				memset(&cl->inner, 0, sizeof(cl->inner));
1008 			else {
1009 				if (cl->leaf.q && !q->offload)
1010 					qdisc_reset(cl->leaf.q);
1011 			}
1012 			cl->prio_activity = 0;
1013 			cl->cmode = HTB_CAN_SEND;
1014 		}
1015 	}
1016 	qdisc_watchdog_cancel(&q->watchdog);
1017 	__qdisc_reset_queue(&q->direct_queue);
1018 	memset(q->hlevel, 0, sizeof(q->hlevel));
1019 	memset(q->row_mask, 0, sizeof(q->row_mask));
1020 }
1021 
/* Netlink attribute policy for the nested TCA_OPTIONS of HTB. It is used both
 * when the qdisc is created (htb_init) and when a class is created or changed
 * (htb_change_class).
 */
static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
	/* per-class parameters, carried as a struct tc_htb_opt */
	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
	/* qdisc-wide parameters, carried as a struct tc_htb_glob */
	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
	/* legacy ceil/rate tables, at most TC_RTAB_SIZE bytes each */
	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
	/* 64-bit rate/ceil values */
	[TCA_HTB_RATE64] = { .type = NLA_U64 },
	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
	/* presence of this flag requests hardware offload */
	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
};
1032 
1033 static void htb_work_func(struct work_struct *work)
1034 {
1035 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1036 	struct Qdisc *sch = q->watchdog.qdisc;
1037 
1038 	rcu_read_lock();
1039 	__netif_schedule(qdisc_root(sch));
1040 	rcu_read_unlock();
1041 }
1042 
/* Assign the qdisc lock of @q to a lockdep class of its own. The key is a
 * single static object, so every qdisc passed to this helper shares the same
 * class.
 */
static void htb_set_lockdep_class_child(struct Qdisc *q)
{
	static struct lock_class_key child_key;

	lockdep_set_class(qdisc_lock(q), &child_key);
}
1049 
1050 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1051 {
1052 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1053 }
1054 
1055 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1056 		    struct netlink_ext_ack *extack)
1057 {
1058 	struct net_device *dev = qdisc_dev(sch);
1059 	struct tc_htb_qopt_offload offload_opt;
1060 	struct htb_sched *q = qdisc_priv(sch);
1061 	struct nlattr *tb[TCA_HTB_MAX + 1];
1062 	struct tc_htb_glob *gopt;
1063 	unsigned int ntx;
1064 	bool offload;
1065 	int err;
1066 
1067 	qdisc_watchdog_init(&q->watchdog, sch);
1068 	INIT_WORK(&q->work, htb_work_func);
1069 
1070 	if (!opt)
1071 		return -EINVAL;
1072 
1073 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1074 	if (err)
1075 		return err;
1076 
1077 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1078 					  NULL);
1079 	if (err < 0)
1080 		return err;
1081 
1082 	if (!tb[TCA_HTB_INIT])
1083 		return -EINVAL;
1084 
1085 	gopt = nla_data(tb[TCA_HTB_INIT]);
1086 	if (gopt->version != HTB_VER >> 16)
1087 		return -EINVAL;
1088 
1089 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1090 
1091 	if (offload) {
1092 		if (sch->parent != TC_H_ROOT) {
1093 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1094 			return -EOPNOTSUPP;
1095 		}
1096 
1097 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1098 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1099 			return -EOPNOTSUPP;
1100 		}
1101 
1102 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1103 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1104 					   sizeof(*q->direct_qdiscs),
1105 					   GFP_KERNEL);
1106 		if (!q->direct_qdiscs)
1107 			return -ENOMEM;
1108 	}
1109 
1110 	err = qdisc_class_hash_init(&q->clhash);
1111 	if (err < 0)
1112 		return err;
1113 
1114 	if (tb[TCA_HTB_DIRECT_QLEN])
1115 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1116 	else
1117 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1118 
1119 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1120 		q->rate2quantum = 1;
1121 	q->defcls = gopt->defcls;
1122 
1123 	if (!offload)
1124 		return 0;
1125 
1126 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1127 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1128 		struct Qdisc *qdisc;
1129 
1130 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1131 					  TC_H_MAKE(sch->handle, 0), extack);
1132 		if (!qdisc) {
1133 			return -ENOMEM;
1134 		}
1135 
1136 		htb_set_lockdep_class_child(qdisc);
1137 		q->direct_qdiscs[ntx] = qdisc;
1138 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1139 	}
1140 
1141 	sch->flags |= TCQ_F_MQROOT;
1142 
1143 	offload_opt = (struct tc_htb_qopt_offload) {
1144 		.command = TC_HTB_CREATE,
1145 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1146 		.classid = TC_H_MIN(q->defcls),
1147 		.extack = extack,
1148 	};
1149 	err = htb_offload(dev, &offload_opt);
1150 	if (err)
1151 		return err;
1152 
1153 	/* Defer this assignment, so that htb_destroy skips offload-related
1154 	 * parts (especially calling ndo_setup_tc) on errors.
1155 	 */
1156 	q->offload = true;
1157 
1158 	return 0;
1159 }
1160 
1161 static void htb_attach_offload(struct Qdisc *sch)
1162 {
1163 	struct net_device *dev = qdisc_dev(sch);
1164 	struct htb_sched *q = qdisc_priv(sch);
1165 	unsigned int ntx;
1166 
1167 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1168 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1169 
1170 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1171 		qdisc_put(old);
1172 		qdisc_hash_add(qdisc, false);
1173 	}
1174 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1175 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1176 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1177 
1178 		qdisc_put(old);
1179 	}
1180 
1181 	kfree(q->direct_qdiscs);
1182 	q->direct_qdiscs = NULL;
1183 }
1184 
1185 static void htb_attach_software(struct Qdisc *sch)
1186 {
1187 	struct net_device *dev = qdisc_dev(sch);
1188 	unsigned int ntx;
1189 
1190 	/* Resemble qdisc_graft behavior. */
1191 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1192 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1193 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1194 
1195 		qdisc_refcount_inc(sch);
1196 
1197 		qdisc_put(old);
1198 	}
1199 }
1200 
1201 static void htb_attach(struct Qdisc *sch)
1202 {
1203 	struct htb_sched *q = qdisc_priv(sch);
1204 
1205 	if (q->offload)
1206 		htb_attach_offload(sch);
1207 	else
1208 		htb_attach_software(sch);
1209 }
1210 
1211 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1212 {
1213 	struct htb_sched *q = qdisc_priv(sch);
1214 	struct nlattr *nest;
1215 	struct tc_htb_glob gopt;
1216 
1217 	if (q->offload)
1218 		sch->flags |= TCQ_F_OFFLOADED;
1219 	else
1220 		sch->flags &= ~TCQ_F_OFFLOADED;
1221 
1222 	sch->qstats.overlimits = q->overlimits;
1223 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1224 	 * no change can happen on the qdisc parameters.
1225 	 */
1226 
1227 	gopt.direct_pkts = q->direct_pkts;
1228 	gopt.version = HTB_VER;
1229 	gopt.rate2quantum = q->rate2quantum;
1230 	gopt.defcls = q->defcls;
1231 	gopt.debug = 0;
1232 
1233 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1234 	if (nest == NULL)
1235 		goto nla_put_failure;
1236 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1237 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1238 		goto nla_put_failure;
1239 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1240 		goto nla_put_failure;
1241 
1242 	return nla_nest_end(skb, nest);
1243 
1244 nla_put_failure:
1245 	nla_nest_cancel(skb, nest);
1246 	return -1;
1247 }
1248 
1249 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1250 			  struct sk_buff *skb, struct tcmsg *tcm)
1251 {
1252 	struct htb_class *cl = (struct htb_class *)arg;
1253 	struct htb_sched *q = qdisc_priv(sch);
1254 	struct nlattr *nest;
1255 	struct tc_htb_opt opt;
1256 
1257 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1258 	 * no change can happen on the class parameters.
1259 	 */
1260 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1261 	tcm->tcm_handle = cl->common.classid;
1262 	if (!cl->level && cl->leaf.q)
1263 		tcm->tcm_info = cl->leaf.q->handle;
1264 
1265 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1266 	if (nest == NULL)
1267 		goto nla_put_failure;
1268 
1269 	memset(&opt, 0, sizeof(opt));
1270 
1271 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1272 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1273 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1274 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1275 	opt.quantum = cl->quantum;
1276 	opt.prio = cl->prio;
1277 	opt.level = cl->level;
1278 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1279 		goto nla_put_failure;
1280 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1281 		goto nla_put_failure;
1282 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1283 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1284 			      TCA_HTB_PAD))
1285 		goto nla_put_failure;
1286 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1287 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1288 			      TCA_HTB_PAD))
1289 		goto nla_put_failure;
1290 
1291 	return nla_nest_end(skb, nest);
1292 
1293 nla_put_failure:
1294 	nla_nest_cancel(skb, nest);
1295 	return -1;
1296 }
1297 
1298 static void htb_offload_aggregate_stats(struct htb_sched *q,
1299 					struct htb_class *cl)
1300 {
1301 	u64 bytes = 0, packets = 0;
1302 	struct htb_class *c;
1303 	unsigned int i;
1304 
1305 	gnet_stats_basic_sync_init(&cl->bstats);
1306 
1307 	for (i = 0; i < q->clhash.hashsize; i++) {
1308 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1309 			struct htb_class *p = c;
1310 
1311 			while (p && p->level < cl->level)
1312 				p = p->parent;
1313 
1314 			if (p != cl)
1315 				continue;
1316 
1317 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1318 			packets += u64_stats_read(&c->bstats_bias.packets);
1319 			if (c->level == 0) {
1320 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1321 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1322 			}
1323 		}
1324 	}
1325 	_bstats_update(&cl->bstats, bytes, packets);
1326 }
1327 
1328 static int
1329 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1330 {
1331 	struct htb_class *cl = (struct htb_class *)arg;
1332 	struct htb_sched *q = qdisc_priv(sch);
1333 	struct gnet_stats_queue qs = {
1334 		.drops = cl->drops,
1335 		.overlimits = cl->overlimits,
1336 	};
1337 	__u32 qlen = 0;
1338 
1339 	if (!cl->level && cl->leaf.q)
1340 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1341 
1342 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1343 				    INT_MIN, INT_MAX);
1344 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1345 				     INT_MIN, INT_MAX);
1346 
1347 	if (q->offload) {
1348 		if (!cl->level) {
1349 			if (cl->leaf.q)
1350 				cl->bstats = cl->leaf.q->bstats;
1351 			else
1352 				gnet_stats_basic_sync_init(&cl->bstats);
1353 			_bstats_update(&cl->bstats,
1354 				       u64_stats_read(&cl->bstats_bias.bytes),
1355 				       u64_stats_read(&cl->bstats_bias.packets));
1356 		} else {
1357 			htb_offload_aggregate_stats(q, cl);
1358 		}
1359 	}
1360 
1361 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1362 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1363 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1364 		return -1;
1365 
1366 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1367 }
1368 
1369 static struct netdev_queue *
1370 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1371 {
1372 	struct net_device *dev = qdisc_dev(sch);
1373 	struct tc_htb_qopt_offload offload_opt;
1374 	struct htb_sched *q = qdisc_priv(sch);
1375 	int err;
1376 
1377 	if (!q->offload)
1378 		return sch->dev_queue;
1379 
1380 	offload_opt = (struct tc_htb_qopt_offload) {
1381 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1382 		.classid = TC_H_MIN(tcm->tcm_parent),
1383 	};
1384 	err = htb_offload(dev, &offload_opt);
1385 	if (err || offload_opt.qid >= dev->num_tx_queues)
1386 		return NULL;
1387 	return netdev_get_tx_queue(dev, offload_opt.qid);
1388 }
1389 
1390 static struct Qdisc *
1391 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1392 {
1393 	struct net_device *dev = dev_queue->dev;
1394 	struct Qdisc *old_q;
1395 
1396 	if (dev->flags & IFF_UP)
1397 		dev_deactivate(dev);
1398 	old_q = dev_graft_qdisc(dev_queue, new_q);
1399 	if (new_q)
1400 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1401 	if (dev->flags & IFF_UP)
1402 		dev_activate(dev);
1403 
1404 	return old_q;
1405 }
1406 
1407 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1408 {
1409 	struct netdev_queue *queue;
1410 
1411 	queue = cl->leaf.offload_queue;
1412 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1413 		WARN_ON(cl->leaf.q->dev_queue != queue);
1414 
1415 	return queue;
1416 }
1417 
/* Move the leaf qdisc of @cl_old from its current TX queue to the TX queue
 * recorded for @cl_new, and record that queue in @cl_old.
 *
 * @destroying: when true only the bookkeeping is updated; no qdisc is
 *              ungrafted or grafted and the device is not touched.
 */
static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
				   struct htb_class *cl_new, bool destroying)
{
	struct netdev_queue *queue_old, *queue_new;
	struct net_device *dev = qdisc_dev(sch);

	queue_old = htb_offload_get_queue(cl_old);
	queue_new = htb_offload_get_queue(cl_new);

	if (!destroying) {
		struct Qdisc *qdisc;

		/* Deactivate once here; the matching dev_activate is in the
		 * second !destroying block below.
		 */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);
		qdisc = dev_graft_qdisc(queue_old, NULL);
		WARN_ON(qdisc != cl_old->leaf.q);
	}

	/* Builtin qdiscs are left untouched; only the class's own record of
	 * the queue is updated for them.
	 */
	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
		cl_old->leaf.q->dev_queue = queue_new;
	cl_old->leaf.offload_queue = queue_new;

	if (!destroying) {
		struct Qdisc *qdisc;

		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
		if (dev->flags & IFF_UP)
			dev_activate(dev);
		/* The destination queue is expected to hold only a builtin
		 * qdisc at this point.
		 */
		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
	}
}
1449 
/* Replace the leaf qdisc of class @arg with @new; the previous leaf qdisc is
 * returned through @old. If @new is NULL a default pfifo is created instead.
 *
 * Returns 0 on success, -EINVAL if the class is not a leaf, -ENOBUFS if the
 * default qdisc cannot be created.
 */
static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netdev_queue *dev_queue = sch->dev_queue;
	struct htb_class *cl = (struct htb_class *)arg;
	struct htb_sched *q = qdisc_priv(sch);
	/* old_q is only assigned and read when q->offload is set */
	struct Qdisc *old_q;

	if (cl->level)
		return -EINVAL;

	/* In offload mode the leaf lives on the class's own TX queue. */
	if (q->offload)
		dev_queue = htb_offload_get_queue(cl);

	if (!new) {
		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
					cl->common.classid, extack);
		if (!new)
			return -ENOBUFS;
	}

	if (q->offload) {
		htb_set_lockdep_class_child(new);
		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
		qdisc_refcount_inc(new);
		old_q = htb_graft_helper(dev_queue, new);
	}

	*old = qdisc_replace(sch, new, &cl->leaf.q);

	if (q->offload) {
		/* The qdisc taken off the TX queue must be the class's old
		 * leaf; drop the reference the TX queue was holding.
		 */
		WARN_ON(old_q != *old);
		qdisc_put(old_q);
	}

	return 0;
}
1487 
1488 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1489 {
1490 	struct htb_class *cl = (struct htb_class *)arg;
1491 	return !cl->level ? cl->leaf.q : NULL;
1492 }
1493 
1494 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1495 {
1496 	struct htb_class *cl = (struct htb_class *)arg;
1497 
1498 	htb_deactivate(qdisc_priv(sch), cl);
1499 }
1500 
1501 static inline int htb_parent_last_child(struct htb_class *cl)
1502 {
1503 	if (!cl->parent)
1504 		/* the root class */
1505 		return 0;
1506 	if (cl->parent->children > 1)
1507 		/* not the last child */
1508 		return 0;
1509 	return 1;
1510 }
1511 
1512 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1513 			       struct Qdisc *new_q)
1514 {
1515 	struct htb_sched *q = qdisc_priv(sch);
1516 	struct htb_class *parent = cl->parent;
1517 
1518 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1519 
1520 	if (parent->cmode != HTB_CAN_SEND)
1521 		htb_safe_rb_erase(&parent->pq_node,
1522 				  &q->hlevel[parent->level].wait_pq);
1523 
1524 	parent->level = 0;
1525 	memset(&parent->inner, 0, sizeof(parent->inner));
1526 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1527 	parent->tokens = parent->buffer;
1528 	parent->ctokens = parent->cbuffer;
1529 	parent->t_c = ktime_get_ns();
1530 	parent->cmode = HTB_CAN_SEND;
1531 	if (q->offload)
1532 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1533 }
1534 
1535 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1536 				       struct netdev_queue *dev_queue,
1537 				       struct Qdisc *new_q)
1538 {
1539 	struct Qdisc *old_q;
1540 
1541 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1542 	if (new_q)
1543 		qdisc_refcount_inc(new_q);
1544 	old_q = htb_graft_helper(dev_queue, new_q);
1545 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1546 }
1547 
/* Tell the driver that leaf class @cl is being removed.
 *
 * @last_child: @cl is the last child of its parent
 * @destroying: the whole qdisc is being destroyed (see the comment in the
 *              body); no grafting is done here in that case
 * @extack:     extended ack passed on to the driver
 *
 * Returns -EINVAL for an inner class, otherwise the result of the driver
 * call.
 */
static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
				     bool last_child, bool destroying,
				     struct netlink_ext_ack *extack)
{
	struct tc_htb_qopt_offload offload_opt;
	struct netdev_queue *dev_queue;
	struct Qdisc *q = cl->leaf.q;
	/* old is only assigned and read when !destroying */
	struct Qdisc *old;
	int err;

	if (cl->level)
		return -EINVAL;

	WARN_ON(!q);
	dev_queue = htb_offload_get_queue(cl);
	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
	 * does not need to graft or qdisc_put the qdisc being destroyed.
	 */
	if (!destroying) {
		old = htb_graft_helper(dev_queue, NULL);
		/* Last qdisc grafted should be the same as cl->leaf.q when
		 * calling htb_delete.
		 */
		WARN_ON(old != q);
	}

	/* Fold the counters of the disappearing leaf qdisc into the parent's
	 * bias so they stay part of the parent's totals.
	 */
	if (cl->parent) {
		_bstats_update(&cl->parent->bstats_bias,
			       u64_stats_read(&q->bstats.bytes),
			       u64_stats_read(&q->bstats.packets));
	}

	/* Pick the driver command: plain delete, or one of the two "last
	 * child" variants depending on whether the qdisc is being destroyed.
	 */
	offload_opt = (struct tc_htb_qopt_offload) {
		.command = !last_child ? TC_HTB_LEAF_DEL :
			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
			   TC_HTB_LEAF_DEL_LAST,
		.classid = cl->common.classid,
		.extack = extack,
	};
	err = htb_offload(qdisc_dev(sch), &offload_opt);

	if (!destroying) {
		/* On success release the ungrafted qdisc reference; on
		 * failure graft it back onto the queue.
		 */
		if (!err)
			qdisc_put(old);
		else
			htb_graft_helper(dev_queue, old);
	}

	if (last_child)
		return err;

	/* The driver may hand back a different classid in offload_opt;
	 * presumably that is the class it moved into the freed queue, so its
	 * qdisc is moved accordingly. NOTE(review): confirm against the
	 * driver contract.
	 */
	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
		u32 classid = TC_H_MAJ(sch->handle) |
			      TC_H_MIN(offload_opt.classid);
		struct htb_class *moved_cl = htb_find(classid, sch);

		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
	}

	return err;
}
1610 
1611 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1612 {
1613 	if (!cl->level) {
1614 		WARN_ON(!cl->leaf.q);
1615 		qdisc_put(cl->leaf.q);
1616 	}
1617 	gen_kill_estimator(&cl->rate_est);
1618 	tcf_block_put(cl->block);
1619 	kfree(cl);
1620 }
1621 
/* Destroy the qdisc: stop deferred work and the watchdog, release filter
 * blocks, destroy every class, and finally tear down the offload state and
 * any direct qdiscs still owned by the qdisc.
 */
static void htb_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct tc_htb_qopt_offload offload_opt;
	struct htb_sched *q = qdisc_priv(sch);
	struct hlist_node *next;
	bool nonempty, changed;
	struct htb_class *cl;
	unsigned int i;

	cancel_work_sync(&q->work);
	qdisc_watchdog_cancel(&q->watchdog);
	/* This line used to be after htb_destroy_class call below
	 * and surprisingly it worked in 2.4. But it must precede it
	 * because filter need its target class alive to be able to call
	 * unbind_filter on it (without Oops).
	 */
	tcf_block_put(q->block);

	/* Release all per-class filter blocks first; cl->block is cleared so
	 * that htb_destroy_class passes NULL to tcf_block_put later.
	 */
	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
			tcf_block_put(cl->block);
			cl->block = NULL;
		}
	}

	/* Without offload a single pass destroys every class. With offload
	 * only leaves are destroyed in a pass; destroying the last child
	 * turns its parent into a leaf, so passes repeat until one pass
	 * destroys nothing.
	 */
	do {
		nonempty = false;
		changed = false;
		for (i = 0; i < q->clhash.hashsize; i++) {
			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
						  common.hnode) {
				bool last_child;

				if (!q->offload) {
					htb_destroy_class(sch, cl);
					continue;
				}

				nonempty = true;

				/* inner class: wait until it becomes a leaf */
				if (cl->level)
					continue;

				changed = true;

				last_child = htb_parent_last_child(cl);
				htb_destroy_class_offload(sch, cl, last_child,
							  true, NULL);
				qdisc_class_hash_remove(&q->clhash,
							&cl->common);
				if (cl->parent)
					cl->parent->children--;
				if (last_child)
					htb_parent_to_leaf(sch, cl, NULL);
				htb_destroy_class(sch, cl);
			}
		}
	} while (changed);
	/* The final pass must have found the hash empty. */
	WARN_ON(nonempty);

	qdisc_class_hash_destroy(&q->clhash);
	__qdisc_reset_queue(&q->direct_queue);

	if (q->offload) {
		offload_opt = (struct tc_htb_qopt_offload) {
			.command = TC_HTB_DESTROY,
		};
		htb_offload(dev, &offload_opt);
	}

	/* direct_qdiscs is set to NULL by htb_attach_offload; if it is still
	 * set here, put the qdiscs it holds (stopping at the first empty
	 * slot) and free the array.
	 */
	if (!q->direct_qdiscs)
		return;
	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
		qdisc_put(q->direct_qdiscs[i]);
	kfree(q->direct_qdiscs);
}
1699 
/* Delete class @arg from the qdisc.
 *
 * Returns -EBUSY if the class still has children or bound filters, the
 * error of the offload call if that fails, and 0 on success. When the class
 * was the last child of its parent, the parent is converted back into a leaf
 * with a fresh pfifo (or noop_qdisc if that cannot be created).
 */
static int htb_delete(struct Qdisc *sch, unsigned long arg,
		      struct netlink_ext_ack *extack)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;
	struct Qdisc *new_q = NULL;
	int last_child = 0;
	int err;

	/* TODO: why don't allow to delete subtree ? references ? does
	 * tc subsys guarantee us that in htb_destroy it holds no class
	 * refs so that we can remove children safely there ?
	 */
	if (cl->children || cl->filter_cnt)
		return -EBUSY;

	if (!cl->level && htb_parent_last_child(cl))
		last_child = 1;

	/* Ask the driver first; nothing has been changed in software yet, so
	 * a failure can simply be returned.
	 */
	if (q->offload) {
		err = htb_destroy_class_offload(sch, cl, last_child, false,
						extack);
		if (err)
			return err;
	}

	/* Create the parent's replacement leaf qdisc before taking the tree
	 * lock.
	 */
	if (last_child) {
		struct netdev_queue *dev_queue = sch->dev_queue;

		if (q->offload)
			dev_queue = htb_offload_get_queue(cl);

		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
					  cl->parent->common.classid,
					  NULL);
		if (q->offload) {
			if (new_q)
				htb_set_lockdep_class_child(new_q);
			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
		}
	}

	sch_tree_lock(sch);

	if (!cl->level)
		qdisc_purge_queue(cl->leaf.q);

	/* delete from hash and active; remainder in destroy_class */
	qdisc_class_hash_remove(&q->clhash, &cl->common);
	if (cl->parent)
		cl->parent->children--;

	if (cl->prio_activity)
		htb_deactivate(q, cl);

	/* a class that is not in HTB_CAN_SEND mode sits in a wait queue */
	if (cl->cmode != HTB_CAN_SEND)
		htb_safe_rb_erase(&cl->pq_node,
				  &q->hlevel[cl->level].wait_pq);

	if (last_child)
		htb_parent_to_leaf(sch, cl, new_q);

	sch_tree_unlock(sch);

	htb_destroy_class(sch, cl);
	return 0;
}
1767 
/* Create a new class or change the parameters of an existing one.
 *
 * @sch:      the HTB qdisc
 * @classid:  id of the class to create (used only when *@arg is 0)
 * @parentid: id of the parent class, or TC_H_ROOT
 * @tca:      top-level attributes; TCA_OPTIONS must hold TCA_HTB_PARMS
 * @arg:      in: existing class pointer or 0; out: the class pointer
 * @extack:   extended ack for error and warning messages
 *
 * Returns 0 on success or a negative errno.
 */
static int htb_change_class(struct Qdisc *sch, u32 classid,
			    u32 parentid, struct nlattr **tca,
			    unsigned long *arg, struct netlink_ext_ack *extack)
{
	int err = -EINVAL;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)*arg, *parent;
	struct tc_htb_qopt_offload offload_opt;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_HTB_MAX + 1];
	/* old leaf qdisc of a parent turned into an inner node; put after
	 * the tree lock is released
	 */
	struct Qdisc *parent_qdisc = NULL;
	struct netdev_queue *dev_queue;
	struct tc_htb_opt *hopt;
	u64 rate64, ceil64;
	/* -1: computed quantum too small, 1: too big, 0: no warning */
	int warn = 0;

	/* extract all subattrs from opt attr */
	if (!opt)
		goto failure;

	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
					  extack);
	if (err < 0)
		goto failure;

	err = -EINVAL;
	if (tb[TCA_HTB_PARMS] == NULL)
		goto failure;

	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);

	hopt = nla_data(tb[TCA_HTB_PARMS]);
	/* both rate and ceil must be non-zero */
	if (!hopt->rate.rate || !hopt->ceil.rate)
		goto failure;

	if (q->offload) {
		/* Options not supported by the offload. */
		if (hopt->rate.overhead || hopt->ceil.overhead) {
			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
			goto failure;
		}
		if (hopt->rate.mpu || hopt->ceil.mpu) {
			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
			goto failure;
		}
		if (hopt->quantum) {
			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter");
			goto failure;
		}
	}

	/* Keeping backward compatible with rate_table based iproute2 tc */
	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
					      NULL));

	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
					      NULL));

	/* optional 64-bit rates; 0 when the attribute is absent */
	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;

	if (!cl) {		/* new class */
		struct net_device *dev = qdisc_dev(sch);
		struct Qdisc *new_q, *old_q;
		int prio;
		/* default estimator config, used when htb_rate_est is set
		 * and user space supplied no TCA_RATE
		 */
		struct {
			struct nlattr		nla;
			struct gnet_estimator	opt;
		} est = {
			.nla = {
				.nla_len	= nla_attr_size(sizeof(est.opt)),
				.nla_type	= TCA_RATE,
			},
			.opt = {
				/* 4s interval, 16s averaging constant */
				.interval	= 2,
				.ewma_log	= 2,
			},
		};

		/* check for valid classid */
		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
		    htb_find(classid, sch))
			goto failure;

		/* check maximal depth */
		if (parent && parent->parent && parent->parent->level < 2) {
			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
			goto failure;
		}
		err = -ENOBUFS;
		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
		if (!cl)
			goto failure;

		gnet_stats_basic_sync_init(&cl->bstats);
		gnet_stats_basic_sync_init(&cl->bstats_bias);

		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
		if (err) {
			kfree(cl);
			goto failure;
		}
		if (htb_rate_est || tca[TCA_RATE]) {
			err = gen_new_estimator(&cl->bstats, NULL,
						&cl->rate_est,
						NULL,
						true,
						tca[TCA_RATE] ? : &est.nla);
			if (err)
				goto err_block_put;
		}

		cl->children = 0;
		RB_CLEAR_NODE(&cl->pq_node);

		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
			RB_CLEAR_NODE(&cl->node[prio]);

		cl->common.classid = classid;

		/* Make sure nothing interrupts us in between of two
		 * ndo_setup_tc calls.
		 */
		ASSERT_RTNL();

		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
		 * so that can't be used inside of sch_tree_lock
		 * -- thanks to Karlis Peisenieks
		 */
		if (!q->offload) {
			dev_queue = sch->dev_queue;
		} else if (!(parent && !parent->level)) {
			/* Assign a dev_queue to this classid. */
			offload_opt = (struct tc_htb_qopt_offload) {
				.command = TC_HTB_LEAF_ALLOC_QUEUE,
				.classid = cl->common.classid,
				.parent_classid = parent ?
					TC_H_MIN(parent->common.classid) :
					TC_HTB_CLASSID_ROOT,
				.rate = max_t(u64, hopt->rate.rate, rate64),
				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
				.prio = hopt->prio,
				.extack = extack,
			};
			err = htb_offload(dev, &offload_opt);
			if (err) {
				NL_SET_ERR_MSG_WEAK(extack,
						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
				goto err_kill_estimator;
			}
			/* the driver reports the allocated queue in qid */
			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
		} else { /* First child. */
			/* The parent is a leaf: the new class takes over the
			 * parent's TX queue, so the parent's qdisc is
			 * ungrafted from it first.
			 */
			dev_queue = htb_offload_get_queue(parent);
			old_q = htb_graft_helper(dev_queue, NULL);
			WARN_ON(old_q != parent->leaf.q);
			offload_opt = (struct tc_htb_qopt_offload) {
				.command = TC_HTB_LEAF_TO_INNER,
				.classid = cl->common.classid,
				.parent_classid =
					TC_H_MIN(parent->common.classid),
				.rate = max_t(u64, hopt->rate.rate, rate64),
				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
				.prio = hopt->prio,
				.extack = extack,
			};
			err = htb_offload(dev, &offload_opt);
			if (err) {
				NL_SET_ERR_MSG_WEAK(extack,
						    "Failed to offload TC_HTB_LEAF_TO_INNER");
				/* roll back: put the parent's qdisc back */
				htb_graft_helper(dev_queue, old_q);
				goto err_kill_estimator;
			}
			/* keep the old leaf's counters in the parent's bias */
			_bstats_update(&parent->bstats_bias,
				       u64_stats_read(&old_q->bstats.bytes),
				       u64_stats_read(&old_q->bstats.packets));
			qdisc_put(old_q);
		}
		/* new_q may be NULL; noop_qdisc is used in that case below */
		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
					  classid, NULL);
		if (q->offload) {
			if (new_q) {
				htb_set_lockdep_class_child(new_q);
				/* One ref for cl->leaf.q, the other for
				 * dev_queue->qdisc.
				 */
				qdisc_refcount_inc(new_q);
			}
			old_q = htb_graft_helper(dev_queue, new_q);
			/* No qdisc_put needed. */
			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
		}
		sch_tree_lock(sch);
		if (parent && !parent->level) {
			/* turn parent into inner node */
			qdisc_purge_queue(parent->leaf.q);
			parent_qdisc = parent->leaf.q;
			if (parent->prio_activity)
				htb_deactivate(q, parent);

			/* remove from evt list because of level change */
			if (parent->cmode != HTB_CAN_SEND) {
				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
				parent->cmode = HTB_CAN_SEND;
			}
			/* one level below the grandparent, or the top level
			 * (TC_HTB_MAXDEPTH - 1) for a root class
			 */
			parent->level = (parent->parent ? parent->parent->level
					 : TC_HTB_MAXDEPTH) - 1;
			memset(&parent->inner, 0, sizeof(parent->inner));
		}

		/* leaf (we) needs elementary qdisc */
		cl->leaf.q = new_q ? new_q : &noop_qdisc;
		if (q->offload)
			cl->leaf.offload_queue = dev_queue;

		cl->parent = parent;

		/* set class to be in HTB_CAN_SEND state */
		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
		cl->t_c = ktime_get_ns();
		cl->cmode = HTB_CAN_SEND;

		/* attach to the hash list and parent's family */
		qdisc_class_hash_insert(&q->clhash, &cl->common);
		if (parent)
			parent->children++;
		if (cl->leaf.q != &noop_qdisc)
			qdisc_hash_add(cl->leaf.q, true);
	} else {
		/* existing class: only its parameters are updated */
		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    NULL,
						    true,
						    tca[TCA_RATE]);
			if (err)
				return err;
		}

		if (q->offload) {
			struct net_device *dev = qdisc_dev(sch);

			offload_opt = (struct tc_htb_qopt_offload) {
				.command = TC_HTB_NODE_MODIFY,
				.classid = cl->common.classid,
				.rate = max_t(u64, hopt->rate.rate, rate64),
				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
				.prio = hopt->prio,
				.extack = extack,
			};
			err = htb_offload(dev, &offload_opt);
			if (err)
				/* Estimator was replaced, and rollback may fail
				 * as well, so we don't try to recover it, and
				 * the estimator won't work properly with the
				 * offload anyway, because bstats are updated
				 * only when the stats are queried.
				 */
				return err;
		}

		sch_tree_lock(sch);
	}

	/* From here on both paths hold the tree lock. */
	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);

	/* it used to be a nasty bug here, we have to check that node
	 * is really leaf before changing cl->leaf !
	 */
	if (!cl->level) {
		u64 quantum = cl->rate.rate_bytes_ps;

		/* default quantum is rate / r2q, capped at INT_MAX */
		do_div(quantum, q->rate2quantum);
		cl->quantum = min_t(u64, quantum, INT_MAX);

		/* a computed quantum is clamped to [1000, 200000] and a
		 * warning is reported; an explicit quantum is taken as is
		 */
		if (!hopt->quantum && cl->quantum < 1000) {
			warn = -1;
			cl->quantum = 1000;
		}
		if (!hopt->quantum && cl->quantum > 200000) {
			warn = 1;
			cl->quantum = 200000;
		}
		if (hopt->quantum)
			cl->quantum = hopt->quantum;
		/* out-of-range priorities fall back to the last valid one */
		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
			cl->prio = TC_HTB_NUMPRIO - 1;
	}

	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);

	sch_tree_unlock(sch);
	qdisc_put(parent_qdisc);

	if (warn)
		NL_SET_ERR_MSG_FMT_MOD(extack,
				       "quantum of class %X is %s. Consider r2q change.",
				       cl->common.classid, (warn == -1 ? "small" : "big"));

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

	/* Error unwinding for the new-class path, in reverse order of setup. */
err_kill_estimator:
	gen_kill_estimator(&cl->rate_est);
err_block_put:
	tcf_block_put(cl->block);
	kfree(cl);
failure:
	return err;
}
2086 
2087 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2088 				       struct netlink_ext_ack *extack)
2089 {
2090 	struct htb_sched *q = qdisc_priv(sch);
2091 	struct htb_class *cl = (struct htb_class *)arg;
2092 
2093 	return cl ? cl->block : q->block;
2094 }
2095 
2096 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2097 				     u32 classid)
2098 {
2099 	struct htb_class *cl = htb_find(classid, sch);
2100 
2101 	/*if (cl && !cl->level) return 0;
2102 	 * The line above used to be there to prevent attaching filters to
2103 	 * leaves. But at least tc_index filter uses this just to get class
2104 	 * for other reasons so that we have to allow for it.
2105 	 * ----
2106 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2107 	 * another way to "lock" the class - unlike "get" this lock can
2108 	 * be broken by class during destroy IIUC.
2109 	 */
2110 	if (cl)
2111 		cl->filter_cnt++;
2112 	return (unsigned long)cl;
2113 }
2114 
2115 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2116 {
2117 	struct htb_class *cl = (struct htb_class *)arg;
2118 
2119 	if (cl)
2120 		cl->filter_cnt--;
2121 }
2122 
2123 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2124 {
2125 	struct htb_sched *q = qdisc_priv(sch);
2126 	struct htb_class *cl;
2127 	unsigned int i;
2128 
2129 	if (arg->stop)
2130 		return;
2131 
2132 	for (i = 0; i < q->clhash.hashsize; i++) {
2133 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2134 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2135 				return;
2136 		}
2137 	}
2138 }
2139 
2140 static const struct Qdisc_class_ops htb_class_ops = {
2141 	.select_queue	=	htb_select_queue,
2142 	.graft		=	htb_graft,
2143 	.leaf		=	htb_leaf,
2144 	.qlen_notify	=	htb_qlen_notify,
2145 	.find		=	htb_search,
2146 	.change		=	htb_change_class,
2147 	.delete		=	htb_delete,
2148 	.walk		=	htb_walk,
2149 	.tcf_block	=	htb_tcf_block,
2150 	.bind_tcf	=	htb_bind_filter,
2151 	.unbind_tcf	=	htb_unbind_filter,
2152 	.dump		=	htb_dump_class,
2153 	.dump_stats	=	htb_dump_class_stats,
2154 };
2155 
2156 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2157 	.cl_ops		=	&htb_class_ops,
2158 	.id		=	"htb",
2159 	.priv_size	=	sizeof(struct htb_sched),
2160 	.enqueue	=	htb_enqueue,
2161 	.dequeue	=	htb_dequeue,
2162 	.peek		=	qdisc_peek_dequeued,
2163 	.init		=	htb_init,
2164 	.attach		=	htb_attach,
2165 	.reset		=	htb_reset,
2166 	.destroy	=	htb_destroy,
2167 	.dump		=	htb_dump,
2168 	.owner		=	THIS_MODULE,
2169 };
2170 
2171 static int __init htb_module_init(void)
2172 {
2173 	return register_qdisc(&htb_qdisc_ops);
2174 }
2175 static void __exit htb_module_exit(void)
2176 {
2177 	unregister_qdisc(&htb_qdisc_ops);
2178 }
2179 
2180 module_init(htb_module_init)
2181 module_exit(htb_module_exit)
2182 MODULE_LICENSE("GPL");
2183