xref: /openbmc/linux/net/sched/sch_htb.c (revision 0f4b20ef)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 	int			filter_cnt;
106 
107 	int			level;		/* our level (see above) */
108 	unsigned int		children;
109 	struct htb_class	*parent;	/* parent class */
110 
111 	struct net_rate_estimator __rcu *rate_est;
112 
113 	/*
114 	 * Written often fields
115 	 */
116 	struct gnet_stats_basic_sync bstats;
117 	struct gnet_stats_basic_sync bstats_bias;
118 	struct tc_htb_xstats	xstats;	/* our special stats */
119 
120 	/* token bucket parameters */
121 	s64			tokens, ctokens;/* current number of tokens */
122 	s64			t_c;		/* checkpoint time */
123 
124 	union {
125 		struct htb_class_leaf {
126 			int		deficit[TC_HTB_MAXDEPTH];
127 			struct Qdisc	*q;
128 			struct netdev_queue *offload_queue;
129 		} leaf;
130 		struct htb_class_inner {
131 			struct htb_prio clprio[TC_HTB_NUMPRIO];
132 		} inner;
133 	};
134 	s64			pq_key;
135 
136 	int			prio_activity;	/* for which prios are we active */
137 	enum htb_cmode		cmode;		/* current mode of the class */
138 	struct rb_node		pq_node;	/* node for event queue */
139 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
140 
141 	unsigned int drops ____cacheline_aligned_in_smp;
142 	unsigned int		overlimits;
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto __rcu	*filter_list;
157 	struct tcf_block	*block;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non shaped skbs; let them go directly thru */
165 	struct qdisc_skb_head	direct_queue;
166 	u32			direct_pkts;
167 	u32			overlimits;
168 
169 	struct qdisc_watchdog	watchdog;
170 
171 	s64			now;	/* cached dequeue time */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 
180 	struct Qdisc		**direct_qdiscs;
181 	unsigned int            num_direct_qdiscs;
182 
183 	bool			offload;
184 };
185 
186 /* find class in global hash table using given handle */
187 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 {
189 	struct htb_sched *q = qdisc_priv(sch);
190 	struct Qdisc_class_common *clc;
191 
192 	clc = qdisc_class_find(&q->clhash, handle);
193 	if (clc == NULL)
194 		return NULL;
195 	return container_of(clc, struct htb_class, common);
196 }
197 
198 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 {
200 	return (unsigned long)htb_find(handle, sch);
201 }
202 /**
203  * htb_classify - classify a packet into class
204  *
205  * It returns NULL if the packet should be dropped or -1 if the packet
206  * should be passed directly thru. In all other cases leaf class is returned.
207  * We allow direct class selection by classid in priority. The we examine
208  * filters in qdisc and in inner nodes (if higher filter points to the inner
209  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
210  * internal fifo (direct). These packets then go directly thru. If we still
211  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
212  * then finish and return direct queue.
213  */
214 #define HTB_DIRECT ((struct htb_class *)-1L)
215 
216 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
217 				      int *qerr)
218 {
219 	struct htb_sched *q = qdisc_priv(sch);
220 	struct htb_class *cl;
221 	struct tcf_result res;
222 	struct tcf_proto *tcf;
223 	int result;
224 
225 	/* allow to select class by setting skb->priority to valid classid;
226 	 * note that nfmark can be used too by attaching filter fw with no
227 	 * rules in it
228 	 */
229 	if (skb->priority == sch->handle)
230 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
231 	cl = htb_find(skb->priority, sch);
232 	if (cl) {
233 		if (cl->level == 0)
234 			return cl;
235 		/* Start with inner filter chain if a non-leaf class is selected */
236 		tcf = rcu_dereference_bh(cl->filter_list);
237 	} else {
238 		tcf = rcu_dereference_bh(q->filter_list);
239 	}
240 
241 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
242 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
243 #ifdef CONFIG_NET_CLS_ACT
244 		switch (result) {
245 		case TC_ACT_QUEUED:
246 		case TC_ACT_STOLEN:
247 		case TC_ACT_TRAP:
248 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
249 			fallthrough;
250 		case TC_ACT_SHOT:
251 			return NULL;
252 		}
253 #endif
254 		cl = (void *)res.class;
255 		if (!cl) {
256 			if (res.classid == sch->handle)
257 				return HTB_DIRECT;	/* X:0 (direct flow) */
258 			cl = htb_find(res.classid, sch);
259 			if (!cl)
260 				break;	/* filter selected invalid classid */
261 		}
262 		if (!cl->level)
263 			return cl;	/* we hit leaf; return it */
264 
265 		/* we have got inner class; apply inner filter chain */
266 		tcf = rcu_dereference_bh(cl->filter_list);
267 	}
268 	/* classification failed; try to use default class */
269 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
270 	if (!cl || cl->level)
271 		return HTB_DIRECT;	/* bad default .. this is safe bet */
272 	return cl;
273 }
274 
275 /**
276  * htb_add_to_id_tree - adds class to the round robin list
277  * @root: the root of the tree
278  * @cl: the class to add
279  * @prio: the give prio in class
280  *
281  * Routine adds class to the list (actually tree) sorted by classid.
282  * Make sure that class is not already on such list for given prio.
283  */
284 static void htb_add_to_id_tree(struct rb_root *root,
285 			       struct htb_class *cl, int prio)
286 {
287 	struct rb_node **p = &root->rb_node, *parent = NULL;
288 
289 	while (*p) {
290 		struct htb_class *c;
291 		parent = *p;
292 		c = rb_entry(parent, struct htb_class, node[prio]);
293 
294 		if (cl->common.classid > c->common.classid)
295 			p = &parent->rb_right;
296 		else
297 			p = &parent->rb_left;
298 	}
299 	rb_link_node(&cl->node[prio], parent, p);
300 	rb_insert_color(&cl->node[prio], root);
301 }
302 
303 /**
304  * htb_add_to_wait_tree - adds class to the event queue with delay
305  * @q: the priority event queue
306  * @cl: the class to add
307  * @delay: delay in microseconds
308  *
309  * The class is added to priority event queue to indicate that class will
310  * change its mode in cl->pq_key microseconds. Make sure that class is not
311  * already in the queue.
312  */
313 static void htb_add_to_wait_tree(struct htb_sched *q,
314 				 struct htb_class *cl, s64 delay)
315 {
316 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
317 
318 	cl->pq_key = q->now + delay;
319 	if (cl->pq_key == q->now)
320 		cl->pq_key++;
321 
322 	/* update the nearest event cache */
323 	if (q->near_ev_cache[cl->level] > cl->pq_key)
324 		q->near_ev_cache[cl->level] = cl->pq_key;
325 
326 	while (*p) {
327 		struct htb_class *c;
328 		parent = *p;
329 		c = rb_entry(parent, struct htb_class, pq_node);
330 		if (cl->pq_key >= c->pq_key)
331 			p = &parent->rb_right;
332 		else
333 			p = &parent->rb_left;
334 	}
335 	rb_link_node(&cl->pq_node, parent, p);
336 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
337 }
338 
339 /**
340  * htb_next_rb_node - finds next node in binary tree
341  * @n: the current node in binary tree
342  *
343  * When we are past last key we return NULL.
344  * Average complexity is 2 steps per call.
345  */
346 static inline void htb_next_rb_node(struct rb_node **n)
347 {
348 	*n = rb_next(*n);
349 }
350 
351 /**
352  * htb_add_class_to_row - add class to its row
353  * @q: the priority event queue
354  * @cl: the class to add
355  * @mask: the given priorities in class in bitmap
356  *
357  * The class is added to row at priorities marked in mask.
358  * It does nothing if mask == 0.
359  */
360 static inline void htb_add_class_to_row(struct htb_sched *q,
361 					struct htb_class *cl, int mask)
362 {
363 	q->row_mask[cl->level] |= mask;
364 	while (mask) {
365 		int prio = ffz(~mask);
366 		mask &= ~(1 << prio);
367 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
368 	}
369 }
370 
371 /* If this triggers, it is a bug in this code, but it need not be fatal */
372 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
373 {
374 	if (RB_EMPTY_NODE(rb)) {
375 		WARN_ON(1);
376 	} else {
377 		rb_erase(rb, root);
378 		RB_CLEAR_NODE(rb);
379 	}
380 }
381 
382 
383 /**
384  * htb_remove_class_from_row - removes class from its row
385  * @q: the priority event queue
386  * @cl: the class to add
387  * @mask: the given priorities in class in bitmap
388  *
389  * The class is removed from row at priorities marked in mask.
390  * It does nothing if mask == 0.
391  */
392 static inline void htb_remove_class_from_row(struct htb_sched *q,
393 						 struct htb_class *cl, int mask)
394 {
395 	int m = 0;
396 	struct htb_level *hlevel = &q->hlevel[cl->level];
397 
398 	while (mask) {
399 		int prio = ffz(~mask);
400 		struct htb_prio *hprio = &hlevel->hprio[prio];
401 
402 		mask &= ~(1 << prio);
403 		if (hprio->ptr == cl->node + prio)
404 			htb_next_rb_node(&hprio->ptr);
405 
406 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
407 		if (!hprio->row.rb_node)
408 			m |= 1 << prio;
409 	}
410 	q->row_mask[cl->level] &= ~m;
411 }
412 
413 /**
414  * htb_activate_prios - creates active classe's feed chain
415  * @q: the priority event queue
416  * @cl: the class to activate
417  *
418  * The class is connected to ancestors and/or appropriate rows
419  * for priorities it is participating on. cl->cmode must be new
420  * (activated) mode. It does nothing if cl->prio_activity == 0.
421  */
422 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
423 {
424 	struct htb_class *p = cl->parent;
425 	long m, mask = cl->prio_activity;
426 
427 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
428 		m = mask;
429 		while (m) {
430 			int prio = ffz(~m);
431 			m &= ~(1 << prio);
432 
433 			if (p->inner.clprio[prio].feed.rb_node)
434 				/* parent already has its feed in use so that
435 				 * reset bit in mask as parent is already ok
436 				 */
437 				mask &= ~(1 << prio);
438 
439 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
440 		}
441 		p->prio_activity |= mask;
442 		cl = p;
443 		p = cl->parent;
444 
445 	}
446 	if (cl->cmode == HTB_CAN_SEND && mask)
447 		htb_add_class_to_row(q, cl, mask);
448 }
449 
450 /**
451  * htb_deactivate_prios - remove class from feed chain
452  * @q: the priority event queue
453  * @cl: the class to deactivate
454  *
455  * cl->cmode must represent old mode (before deactivation). It does
456  * nothing if cl->prio_activity == 0. Class is removed from all feed
457  * chains and rows.
458  */
459 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
460 {
461 	struct htb_class *p = cl->parent;
462 	long m, mask = cl->prio_activity;
463 
464 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
465 		m = mask;
466 		mask = 0;
467 		while (m) {
468 			int prio = ffz(~m);
469 			m &= ~(1 << prio);
470 
471 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
472 				/* we are removing child which is pointed to from
473 				 * parent feed - forget the pointer but remember
474 				 * classid
475 				 */
476 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
477 				p->inner.clprio[prio].ptr = NULL;
478 			}
479 
480 			htb_safe_rb_erase(cl->node + prio,
481 					  &p->inner.clprio[prio].feed);
482 
483 			if (!p->inner.clprio[prio].feed.rb_node)
484 				mask |= 1 << prio;
485 		}
486 
487 		p->prio_activity &= ~mask;
488 		cl = p;
489 		p = cl->parent;
490 
491 	}
492 	if (cl->cmode == HTB_CAN_SEND && mask)
493 		htb_remove_class_from_row(q, cl, mask);
494 }
495 
496 static inline s64 htb_lowater(const struct htb_class *cl)
497 {
498 	if (htb_hysteresis)
499 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
500 	else
501 		return 0;
502 }
503 static inline s64 htb_hiwater(const struct htb_class *cl)
504 {
505 	if (htb_hysteresis)
506 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
507 	else
508 		return 0;
509 }
510 
511 
512 /**
513  * htb_class_mode - computes and returns current class mode
514  * @cl: the target class
515  * @diff: diff time in microseconds
516  *
517  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
518  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
519  * from now to time when cl will change its state.
520  * Also it is worth to note that class mode doesn't change simply
521  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
522  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
523  * mode transitions per time unit. The speed gain is about 1/6.
524  */
525 static inline enum htb_cmode
526 htb_class_mode(struct htb_class *cl, s64 *diff)
527 {
528 	s64 toks;
529 
530 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
531 		*diff = -toks;
532 		return HTB_CANT_SEND;
533 	}
534 
535 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
536 		return HTB_CAN_SEND;
537 
538 	*diff = -toks;
539 	return HTB_MAY_BORROW;
540 }
541 
542 /**
543  * htb_change_class_mode - changes classe's mode
544  * @q: the priority event queue
545  * @cl: the target class
546  * @diff: diff time in microseconds
547  *
548  * This should be the only way how to change classe's mode under normal
549  * circumstances. Routine will update feed lists linkage, change mode
550  * and add class to the wait event queue if appropriate. New mode should
551  * be different from old one and cl->pq_key has to be valid if changing
552  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
553  */
554 static void
555 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
556 {
557 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
558 
559 	if (new_mode == cl->cmode)
560 		return;
561 
562 	if (new_mode == HTB_CANT_SEND) {
563 		cl->overlimits++;
564 		q->overlimits++;
565 	}
566 
567 	if (cl->prio_activity) {	/* not necessary: speed optimization */
568 		if (cl->cmode != HTB_CANT_SEND)
569 			htb_deactivate_prios(q, cl);
570 		cl->cmode = new_mode;
571 		if (new_mode != HTB_CANT_SEND)
572 			htb_activate_prios(q, cl);
573 	} else
574 		cl->cmode = new_mode;
575 }
576 
577 /**
578  * htb_activate - inserts leaf cl into appropriate active feeds
579  * @q: the priority event queue
580  * @cl: the target class
581  *
582  * Routine learns (new) priority of leaf and activates feed chain
583  * for the prio. It can be called on already active leaf safely.
584  * It also adds leaf into droplist.
585  */
586 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
587 {
588 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
589 
590 	if (!cl->prio_activity) {
591 		cl->prio_activity = 1 << cl->prio;
592 		htb_activate_prios(q, cl);
593 	}
594 }
595 
596 /**
597  * htb_deactivate - remove leaf cl from active feeds
598  * @q: the priority event queue
599  * @cl: the target class
600  *
601  * Make sure that leaf is active. In the other words it can't be called
602  * with non-active leaf. It also removes class from the drop list.
603  */
604 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
605 {
606 	WARN_ON(!cl->prio_activity);
607 
608 	htb_deactivate_prios(q, cl);
609 	cl->prio_activity = 0;
610 }
611 
612 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
613 		       struct sk_buff **to_free)
614 {
615 	int ret;
616 	unsigned int len = qdisc_pkt_len(skb);
617 	struct htb_sched *q = qdisc_priv(sch);
618 	struct htb_class *cl = htb_classify(skb, sch, &ret);
619 
620 	if (cl == HTB_DIRECT) {
621 		/* enqueue to helper queue */
622 		if (q->direct_queue.qlen < q->direct_qlen) {
623 			__qdisc_enqueue_tail(skb, &q->direct_queue);
624 			q->direct_pkts++;
625 		} else {
626 			return qdisc_drop(skb, sch, to_free);
627 		}
628 #ifdef CONFIG_NET_CLS_ACT
629 	} else if (!cl) {
630 		if (ret & __NET_XMIT_BYPASS)
631 			qdisc_qstats_drop(sch);
632 		__qdisc_drop(skb, to_free);
633 		return ret;
634 #endif
635 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
636 					to_free)) != NET_XMIT_SUCCESS) {
637 		if (net_xmit_drop_count(ret)) {
638 			qdisc_qstats_drop(sch);
639 			cl->drops++;
640 		}
641 		return ret;
642 	} else {
643 		htb_activate(q, cl);
644 	}
645 
646 	sch->qstats.backlog += len;
647 	sch->q.qlen++;
648 	return NET_XMIT_SUCCESS;
649 }
650 
651 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
652 {
653 	s64 toks = diff + cl->tokens;
654 
655 	if (toks > cl->buffer)
656 		toks = cl->buffer;
657 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
658 	if (toks <= -cl->mbuffer)
659 		toks = 1 - cl->mbuffer;
660 
661 	cl->tokens = toks;
662 }
663 
664 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
665 {
666 	s64 toks = diff + cl->ctokens;
667 
668 	if (toks > cl->cbuffer)
669 		toks = cl->cbuffer;
670 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
671 	if (toks <= -cl->mbuffer)
672 		toks = 1 - cl->mbuffer;
673 
674 	cl->ctokens = toks;
675 }
676 
677 /**
678  * htb_charge_class - charges amount "bytes" to leaf and ancestors
679  * @q: the priority event queue
680  * @cl: the class to start iterate
681  * @level: the minimum level to account
682  * @skb: the socket buffer
683  *
684  * Routine assumes that packet "bytes" long was dequeued from leaf cl
685  * borrowing from "level". It accounts bytes to ceil leaky bucket for
686  * leaf and all ancestors and to rate bucket for ancestors at levels
687  * "level" and higher. It also handles possible change of mode resulting
688  * from the update. Note that mode can also increase here (MAY_BORROW to
689  * CAN_SEND) because we can use more precise clock that event queue here.
690  * In such case we remove class from event queue first.
691  */
692 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
693 			     int level, struct sk_buff *skb)
694 {
695 	int bytes = qdisc_pkt_len(skb);
696 	enum htb_cmode old_mode;
697 	s64 diff;
698 
699 	while (cl) {
700 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
701 		if (cl->level >= level) {
702 			if (cl->level == level)
703 				cl->xstats.lends++;
704 			htb_accnt_tokens(cl, bytes, diff);
705 		} else {
706 			cl->xstats.borrows++;
707 			cl->tokens += diff;	/* we moved t_c; update tokens */
708 		}
709 		htb_accnt_ctokens(cl, bytes, diff);
710 		cl->t_c = q->now;
711 
712 		old_mode = cl->cmode;
713 		diff = 0;
714 		htb_change_class_mode(q, cl, &diff);
715 		if (old_mode != cl->cmode) {
716 			if (old_mode != HTB_CAN_SEND)
717 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
718 			if (cl->cmode != HTB_CAN_SEND)
719 				htb_add_to_wait_tree(q, cl, diff);
720 		}
721 
722 		/* update basic stats except for leaves which are already updated */
723 		if (cl->level)
724 			bstats_update(&cl->bstats, skb);
725 
726 		cl = cl->parent;
727 	}
728 }
729 
730 /**
731  * htb_do_events - make mode changes to classes at the level
732  * @q: the priority event queue
733  * @level: which wait_pq in 'q->hlevel'
734  * @start: start jiffies
735  *
736  * Scans event queue for pending events and applies them. Returns time of
737  * next pending event (0 for no event in pq, q->now for too many events).
738  * Note: Applied are events whose have cl->pq_key <= q->now.
739  */
740 static s64 htb_do_events(struct htb_sched *q, const int level,
741 			 unsigned long start)
742 {
743 	/* don't run for longer than 2 jiffies; 2 is used instead of
744 	 * 1 to simplify things when jiffy is going to be incremented
745 	 * too soon
746 	 */
747 	unsigned long stop_at = start + 2;
748 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
749 
750 	while (time_before(jiffies, stop_at)) {
751 		struct htb_class *cl;
752 		s64 diff;
753 		struct rb_node *p = rb_first(wait_pq);
754 
755 		if (!p)
756 			return 0;
757 
758 		cl = rb_entry(p, struct htb_class, pq_node);
759 		if (cl->pq_key > q->now)
760 			return cl->pq_key;
761 
762 		htb_safe_rb_erase(p, wait_pq);
763 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
764 		htb_change_class_mode(q, cl, &diff);
765 		if (cl->cmode != HTB_CAN_SEND)
766 			htb_add_to_wait_tree(q, cl, diff);
767 	}
768 
769 	/* too much load - let's continue after a break for scheduling */
770 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
771 		pr_warn("htb: too many events!\n");
772 		q->warned |= HTB_WARN_TOOMANYEVENTS;
773 	}
774 
775 	return q->now;
776 }
777 
778 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
779  * is no such one exists.
780  */
781 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
782 					      u32 id)
783 {
784 	struct rb_node *r = NULL;
785 	while (n) {
786 		struct htb_class *cl =
787 		    rb_entry(n, struct htb_class, node[prio]);
788 
789 		if (id > cl->common.classid) {
790 			n = n->rb_right;
791 		} else if (id < cl->common.classid) {
792 			r = n;
793 			n = n->rb_left;
794 		} else {
795 			return n;
796 		}
797 	}
798 	return r;
799 }
800 
801 /**
802  * htb_lookup_leaf - returns next leaf class in DRR order
803  * @hprio: the current one
804  * @prio: which prio in class
805  *
806  * Find leaf where current feed pointers points to.
807  */
808 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
809 {
810 	int i;
811 	struct {
812 		struct rb_node *root;
813 		struct rb_node **pptr;
814 		u32 *pid;
815 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
816 
817 	BUG_ON(!hprio->row.rb_node);
818 	sp->root = hprio->row.rb_node;
819 	sp->pptr = &hprio->ptr;
820 	sp->pid = &hprio->last_ptr_id;
821 
822 	for (i = 0; i < 65535; i++) {
823 		if (!*sp->pptr && *sp->pid) {
824 			/* ptr was invalidated but id is valid - try to recover
825 			 * the original or next ptr
826 			 */
827 			*sp->pptr =
828 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
829 		}
830 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
831 				 * can become out of date quickly
832 				 */
833 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
834 			*sp->pptr = sp->root;
835 			while ((*sp->pptr)->rb_left)
836 				*sp->pptr = (*sp->pptr)->rb_left;
837 			if (sp > stk) {
838 				sp--;
839 				if (!*sp->pptr) {
840 					WARN_ON(1);
841 					return NULL;
842 				}
843 				htb_next_rb_node(sp->pptr);
844 			}
845 		} else {
846 			struct htb_class *cl;
847 			struct htb_prio *clp;
848 
849 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
850 			if (!cl->level)
851 				return cl;
852 			clp = &cl->inner.clprio[prio];
853 			(++sp)->root = clp->feed.rb_node;
854 			sp->pptr = &clp->ptr;
855 			sp->pid = &clp->last_ptr_id;
856 		}
857 	}
858 	WARN_ON(1);
859 	return NULL;
860 }
861 
862 /* dequeues packet at given priority and level; call only if
863  * you are sure that there is active class at prio/level
864  */
865 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
866 					const int level)
867 {
868 	struct sk_buff *skb = NULL;
869 	struct htb_class *cl, *start;
870 	struct htb_level *hlevel = &q->hlevel[level];
871 	struct htb_prio *hprio = &hlevel->hprio[prio];
872 
873 	/* look initial class up in the row */
874 	start = cl = htb_lookup_leaf(hprio, prio);
875 
876 	do {
877 next:
878 		if (unlikely(!cl))
879 			return NULL;
880 
881 		/* class can be empty - it is unlikely but can be true if leaf
882 		 * qdisc drops packets in enqueue routine or if someone used
883 		 * graft operation on the leaf since last dequeue;
884 		 * simply deactivate and skip such class
885 		 */
886 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
887 			struct htb_class *next;
888 			htb_deactivate(q, cl);
889 
890 			/* row/level might become empty */
891 			if ((q->row_mask[level] & (1 << prio)) == 0)
892 				return NULL;
893 
894 			next = htb_lookup_leaf(hprio, prio);
895 
896 			if (cl == start)	/* fix start if we just deleted it */
897 				start = next;
898 			cl = next;
899 			goto next;
900 		}
901 
902 		skb = cl->leaf.q->dequeue(cl->leaf.q);
903 		if (likely(skb != NULL))
904 			break;
905 
906 		qdisc_warn_nonwc("htb", cl->leaf.q);
907 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
908 					 &q->hlevel[0].hprio[prio].ptr);
909 		cl = htb_lookup_leaf(hprio, prio);
910 
911 	} while (cl != start);
912 
913 	if (likely(skb != NULL)) {
914 		bstats_update(&cl->bstats, skb);
915 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
916 		if (cl->leaf.deficit[level] < 0) {
917 			cl->leaf.deficit[level] += cl->quantum;
918 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
919 						 &q->hlevel[0].hprio[prio].ptr);
920 		}
921 		/* this used to be after charge_class but this constelation
922 		 * gives us slightly better performance
923 		 */
924 		if (!cl->leaf.q->q.qlen)
925 			htb_deactivate(q, cl);
926 		htb_charge_class(q, cl, level, skb);
927 	}
928 	return skb;
929 }
930 
931 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
932 {
933 	struct sk_buff *skb;
934 	struct htb_sched *q = qdisc_priv(sch);
935 	int level;
936 	s64 next_event;
937 	unsigned long start_at;
938 
939 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
940 	skb = __qdisc_dequeue_head(&q->direct_queue);
941 	if (skb != NULL) {
942 ok:
943 		qdisc_bstats_update(sch, skb);
944 		qdisc_qstats_backlog_dec(sch, skb);
945 		sch->q.qlen--;
946 		return skb;
947 	}
948 
949 	if (!sch->q.qlen)
950 		goto fin;
951 	q->now = ktime_get_ns();
952 	start_at = jiffies;
953 
954 	next_event = q->now + 5LLU * NSEC_PER_SEC;
955 
956 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
957 		/* common case optimization - skip event handler quickly */
958 		int m;
959 		s64 event = q->near_ev_cache[level];
960 
961 		if (q->now >= event) {
962 			event = htb_do_events(q, level, start_at);
963 			if (!event)
964 				event = q->now + NSEC_PER_SEC;
965 			q->near_ev_cache[level] = event;
966 		}
967 
968 		if (next_event > event)
969 			next_event = event;
970 
971 		m = ~q->row_mask[level];
972 		while (m != (int)(-1)) {
973 			int prio = ffz(m);
974 
975 			m |= 1 << prio;
976 			skb = htb_dequeue_tree(q, prio, level);
977 			if (likely(skb != NULL))
978 				goto ok;
979 		}
980 	}
981 	if (likely(next_event > q->now))
982 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
983 	else
984 		schedule_work(&q->work);
985 fin:
986 	return skb;
987 }
988 
989 /* reset all classes */
990 /* always caled under BH & queue lock */
991 static void htb_reset(struct Qdisc *sch)
992 {
993 	struct htb_sched *q = qdisc_priv(sch);
994 	struct htb_class *cl;
995 	unsigned int i;
996 
997 	for (i = 0; i < q->clhash.hashsize; i++) {
998 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
999 			if (cl->level)
1000 				memset(&cl->inner, 0, sizeof(cl->inner));
1001 			else {
1002 				if (cl->leaf.q && !q->offload)
1003 					qdisc_reset(cl->leaf.q);
1004 			}
1005 			cl->prio_activity = 0;
1006 			cl->cmode = HTB_CAN_SEND;
1007 		}
1008 	}
1009 	qdisc_watchdog_cancel(&q->watchdog);
1010 	__qdisc_reset_queue(&q->direct_queue);
1011 	sch->q.qlen = 0;
1012 	sch->qstats.backlog = 0;
1013 	memset(q->hlevel, 0, sizeof(q->hlevel));
1014 	memset(q->row_mask, 0, sizeof(q->row_mask));
1015 }
1016 
1017 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1018 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1019 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1020 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1021 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1022 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1023 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1024 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1025 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1026 };
1027 
1028 static void htb_work_func(struct work_struct *work)
1029 {
1030 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1031 	struct Qdisc *sch = q->watchdog.qdisc;
1032 
1033 	rcu_read_lock();
1034 	__netif_schedule(qdisc_root(sch));
1035 	rcu_read_unlock();
1036 }
1037 
1038 static void htb_set_lockdep_class_child(struct Qdisc *q)
1039 {
1040 	static struct lock_class_key child_key;
1041 
1042 	lockdep_set_class(qdisc_lock(q), &child_key);
1043 }
1044 
1045 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1046 {
1047 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1048 }
1049 
1050 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1051 		    struct netlink_ext_ack *extack)
1052 {
1053 	struct net_device *dev = qdisc_dev(sch);
1054 	struct tc_htb_qopt_offload offload_opt;
1055 	struct htb_sched *q = qdisc_priv(sch);
1056 	struct nlattr *tb[TCA_HTB_MAX + 1];
1057 	struct tc_htb_glob *gopt;
1058 	unsigned int ntx;
1059 	bool offload;
1060 	int err;
1061 
1062 	qdisc_watchdog_init(&q->watchdog, sch);
1063 	INIT_WORK(&q->work, htb_work_func);
1064 
1065 	if (!opt)
1066 		return -EINVAL;
1067 
1068 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1069 	if (err)
1070 		return err;
1071 
1072 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1073 					  NULL);
1074 	if (err < 0)
1075 		return err;
1076 
1077 	if (!tb[TCA_HTB_INIT])
1078 		return -EINVAL;
1079 
1080 	gopt = nla_data(tb[TCA_HTB_INIT]);
1081 	if (gopt->version != HTB_VER >> 16)
1082 		return -EINVAL;
1083 
1084 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1085 
1086 	if (offload) {
1087 		if (sch->parent != TC_H_ROOT) {
1088 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1089 			return -EOPNOTSUPP;
1090 		}
1091 
1092 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1093 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1094 			return -EOPNOTSUPP;
1095 		}
1096 
1097 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1098 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1099 					   sizeof(*q->direct_qdiscs),
1100 					   GFP_KERNEL);
1101 		if (!q->direct_qdiscs)
1102 			return -ENOMEM;
1103 	}
1104 
1105 	err = qdisc_class_hash_init(&q->clhash);
1106 	if (err < 0)
1107 		goto err_free_direct_qdiscs;
1108 
1109 	qdisc_skb_head_init(&q->direct_queue);
1110 
1111 	if (tb[TCA_HTB_DIRECT_QLEN])
1112 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1113 	else
1114 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1115 
1116 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1117 		q->rate2quantum = 1;
1118 	q->defcls = gopt->defcls;
1119 
1120 	if (!offload)
1121 		return 0;
1122 
1123 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1124 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1125 		struct Qdisc *qdisc;
1126 
1127 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1128 					  TC_H_MAKE(sch->handle, 0), extack);
1129 		if (!qdisc) {
1130 			err = -ENOMEM;
1131 			goto err_free_qdiscs;
1132 		}
1133 
1134 		htb_set_lockdep_class_child(qdisc);
1135 		q->direct_qdiscs[ntx] = qdisc;
1136 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1137 	}
1138 
1139 	sch->flags |= TCQ_F_MQROOT;
1140 
1141 	offload_opt = (struct tc_htb_qopt_offload) {
1142 		.command = TC_HTB_CREATE,
1143 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1144 		.classid = TC_H_MIN(q->defcls),
1145 		.extack = extack,
1146 	};
1147 	err = htb_offload(dev, &offload_opt);
1148 	if (err)
1149 		goto err_free_qdiscs;
1150 
1151 	/* Defer this assignment, so that htb_destroy skips offload-related
1152 	 * parts (especially calling ndo_setup_tc) on errors.
1153 	 */
1154 	q->offload = true;
1155 
1156 	return 0;
1157 
1158 err_free_qdiscs:
1159 	for (ntx = 0; ntx < q->num_direct_qdiscs && q->direct_qdiscs[ntx];
1160 	     ntx++)
1161 		qdisc_put(q->direct_qdiscs[ntx]);
1162 
1163 	qdisc_class_hash_destroy(&q->clhash);
1164 	/* Prevent use-after-free and double-free when htb_destroy gets called.
1165 	 */
1166 	q->clhash.hash = NULL;
1167 	q->clhash.hashsize = 0;
1168 
1169 err_free_direct_qdiscs:
1170 	kfree(q->direct_qdiscs);
1171 	q->direct_qdiscs = NULL;
1172 	return err;
1173 }
1174 
1175 static void htb_attach_offload(struct Qdisc *sch)
1176 {
1177 	struct net_device *dev = qdisc_dev(sch);
1178 	struct htb_sched *q = qdisc_priv(sch);
1179 	unsigned int ntx;
1180 
1181 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1182 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1183 
1184 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1185 		qdisc_put(old);
1186 		qdisc_hash_add(qdisc, false);
1187 	}
1188 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1189 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1190 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1191 
1192 		qdisc_put(old);
1193 	}
1194 
1195 	kfree(q->direct_qdiscs);
1196 	q->direct_qdiscs = NULL;
1197 }
1198 
1199 static void htb_attach_software(struct Qdisc *sch)
1200 {
1201 	struct net_device *dev = qdisc_dev(sch);
1202 	unsigned int ntx;
1203 
1204 	/* Resemble qdisc_graft behavior. */
1205 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1206 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1207 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1208 
1209 		qdisc_refcount_inc(sch);
1210 
1211 		qdisc_put(old);
1212 	}
1213 }
1214 
1215 static void htb_attach(struct Qdisc *sch)
1216 {
1217 	struct htb_sched *q = qdisc_priv(sch);
1218 
1219 	if (q->offload)
1220 		htb_attach_offload(sch);
1221 	else
1222 		htb_attach_software(sch);
1223 }
1224 
1225 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1226 {
1227 	struct htb_sched *q = qdisc_priv(sch);
1228 	struct nlattr *nest;
1229 	struct tc_htb_glob gopt;
1230 
1231 	if (q->offload)
1232 		sch->flags |= TCQ_F_OFFLOADED;
1233 	else
1234 		sch->flags &= ~TCQ_F_OFFLOADED;
1235 
1236 	sch->qstats.overlimits = q->overlimits;
1237 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1238 	 * no change can happen on the qdisc parameters.
1239 	 */
1240 
1241 	gopt.direct_pkts = q->direct_pkts;
1242 	gopt.version = HTB_VER;
1243 	gopt.rate2quantum = q->rate2quantum;
1244 	gopt.defcls = q->defcls;
1245 	gopt.debug = 0;
1246 
1247 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1248 	if (nest == NULL)
1249 		goto nla_put_failure;
1250 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1251 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1252 		goto nla_put_failure;
1253 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1254 		goto nla_put_failure;
1255 
1256 	return nla_nest_end(skb, nest);
1257 
1258 nla_put_failure:
1259 	nla_nest_cancel(skb, nest);
1260 	return -1;
1261 }
1262 
1263 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1264 			  struct sk_buff *skb, struct tcmsg *tcm)
1265 {
1266 	struct htb_class *cl = (struct htb_class *)arg;
1267 	struct htb_sched *q = qdisc_priv(sch);
1268 	struct nlattr *nest;
1269 	struct tc_htb_opt opt;
1270 
1271 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1272 	 * no change can happen on the class parameters.
1273 	 */
1274 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1275 	tcm->tcm_handle = cl->common.classid;
1276 	if (!cl->level && cl->leaf.q)
1277 		tcm->tcm_info = cl->leaf.q->handle;
1278 
1279 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1280 	if (nest == NULL)
1281 		goto nla_put_failure;
1282 
1283 	memset(&opt, 0, sizeof(opt));
1284 
1285 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1286 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1287 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1288 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1289 	opt.quantum = cl->quantum;
1290 	opt.prio = cl->prio;
1291 	opt.level = cl->level;
1292 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1293 		goto nla_put_failure;
1294 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1295 		goto nla_put_failure;
1296 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1297 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1298 			      TCA_HTB_PAD))
1299 		goto nla_put_failure;
1300 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1301 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1302 			      TCA_HTB_PAD))
1303 		goto nla_put_failure;
1304 
1305 	return nla_nest_end(skb, nest);
1306 
1307 nla_put_failure:
1308 	nla_nest_cancel(skb, nest);
1309 	return -1;
1310 }
1311 
1312 static void htb_offload_aggregate_stats(struct htb_sched *q,
1313 					struct htb_class *cl)
1314 {
1315 	u64 bytes = 0, packets = 0;
1316 	struct htb_class *c;
1317 	unsigned int i;
1318 
1319 	gnet_stats_basic_sync_init(&cl->bstats);
1320 
1321 	for (i = 0; i < q->clhash.hashsize; i++) {
1322 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1323 			struct htb_class *p = c;
1324 
1325 			while (p && p->level < cl->level)
1326 				p = p->parent;
1327 
1328 			if (p != cl)
1329 				continue;
1330 
1331 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1332 			packets += u64_stats_read(&c->bstats_bias.packets);
1333 			if (c->level == 0) {
1334 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1335 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1336 			}
1337 		}
1338 	}
1339 	_bstats_update(&cl->bstats, bytes, packets);
1340 }
1341 
1342 static int
1343 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1344 {
1345 	struct htb_class *cl = (struct htb_class *)arg;
1346 	struct htb_sched *q = qdisc_priv(sch);
1347 	struct gnet_stats_queue qs = {
1348 		.drops = cl->drops,
1349 		.overlimits = cl->overlimits,
1350 	};
1351 	__u32 qlen = 0;
1352 
1353 	if (!cl->level && cl->leaf.q)
1354 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1355 
1356 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1357 				    INT_MIN, INT_MAX);
1358 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1359 				     INT_MIN, INT_MAX);
1360 
1361 	if (q->offload) {
1362 		if (!cl->level) {
1363 			if (cl->leaf.q)
1364 				cl->bstats = cl->leaf.q->bstats;
1365 			else
1366 				gnet_stats_basic_sync_init(&cl->bstats);
1367 			_bstats_update(&cl->bstats,
1368 				       u64_stats_read(&cl->bstats_bias.bytes),
1369 				       u64_stats_read(&cl->bstats_bias.packets));
1370 		} else {
1371 			htb_offload_aggregate_stats(q, cl);
1372 		}
1373 	}
1374 
1375 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1376 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1377 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1378 		return -1;
1379 
1380 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1381 }
1382 
1383 static struct netdev_queue *
1384 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1385 {
1386 	struct net_device *dev = qdisc_dev(sch);
1387 	struct tc_htb_qopt_offload offload_opt;
1388 	struct htb_sched *q = qdisc_priv(sch);
1389 	int err;
1390 
1391 	if (!q->offload)
1392 		return sch->dev_queue;
1393 
1394 	offload_opt = (struct tc_htb_qopt_offload) {
1395 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1396 		.classid = TC_H_MIN(tcm->tcm_parent),
1397 	};
1398 	err = htb_offload(dev, &offload_opt);
1399 	if (err || offload_opt.qid >= dev->num_tx_queues)
1400 		return NULL;
1401 	return netdev_get_tx_queue(dev, offload_opt.qid);
1402 }
1403 
1404 static struct Qdisc *
1405 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1406 {
1407 	struct net_device *dev = dev_queue->dev;
1408 	struct Qdisc *old_q;
1409 
1410 	if (dev->flags & IFF_UP)
1411 		dev_deactivate(dev);
1412 	old_q = dev_graft_qdisc(dev_queue, new_q);
1413 	if (new_q)
1414 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1415 	if (dev->flags & IFF_UP)
1416 		dev_activate(dev);
1417 
1418 	return old_q;
1419 }
1420 
1421 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1422 {
1423 	struct netdev_queue *queue;
1424 
1425 	queue = cl->leaf.offload_queue;
1426 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1427 		WARN_ON(cl->leaf.q->dev_queue != queue);
1428 
1429 	return queue;
1430 }
1431 
1432 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1433 				   struct htb_class *cl_new, bool destroying)
1434 {
1435 	struct netdev_queue *queue_old, *queue_new;
1436 	struct net_device *dev = qdisc_dev(sch);
1437 
1438 	queue_old = htb_offload_get_queue(cl_old);
1439 	queue_new = htb_offload_get_queue(cl_new);
1440 
1441 	if (!destroying) {
1442 		struct Qdisc *qdisc;
1443 
1444 		if (dev->flags & IFF_UP)
1445 			dev_deactivate(dev);
1446 		qdisc = dev_graft_qdisc(queue_old, NULL);
1447 		WARN_ON(qdisc != cl_old->leaf.q);
1448 	}
1449 
1450 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1451 		cl_old->leaf.q->dev_queue = queue_new;
1452 	cl_old->leaf.offload_queue = queue_new;
1453 
1454 	if (!destroying) {
1455 		struct Qdisc *qdisc;
1456 
1457 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1458 		if (dev->flags & IFF_UP)
1459 			dev_activate(dev);
1460 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1461 	}
1462 }
1463 
1464 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1465 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1466 {
1467 	struct netdev_queue *dev_queue = sch->dev_queue;
1468 	struct htb_class *cl = (struct htb_class *)arg;
1469 	struct htb_sched *q = qdisc_priv(sch);
1470 	struct Qdisc *old_q;
1471 
1472 	if (cl->level)
1473 		return -EINVAL;
1474 
1475 	if (q->offload)
1476 		dev_queue = htb_offload_get_queue(cl);
1477 
1478 	if (!new) {
1479 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1480 					cl->common.classid, extack);
1481 		if (!new)
1482 			return -ENOBUFS;
1483 	}
1484 
1485 	if (q->offload) {
1486 		htb_set_lockdep_class_child(new);
1487 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1488 		qdisc_refcount_inc(new);
1489 		old_q = htb_graft_helper(dev_queue, new);
1490 	}
1491 
1492 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1493 
1494 	if (q->offload) {
1495 		WARN_ON(old_q != *old);
1496 		qdisc_put(old_q);
1497 	}
1498 
1499 	return 0;
1500 }
1501 
1502 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1503 {
1504 	struct htb_class *cl = (struct htb_class *)arg;
1505 	return !cl->level ? cl->leaf.q : NULL;
1506 }
1507 
1508 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1509 {
1510 	struct htb_class *cl = (struct htb_class *)arg;
1511 
1512 	htb_deactivate(qdisc_priv(sch), cl);
1513 }
1514 
1515 static inline int htb_parent_last_child(struct htb_class *cl)
1516 {
1517 	if (!cl->parent)
1518 		/* the root class */
1519 		return 0;
1520 	if (cl->parent->children > 1)
1521 		/* not the last child */
1522 		return 0;
1523 	return 1;
1524 }
1525 
1526 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1527 			       struct Qdisc *new_q)
1528 {
1529 	struct htb_sched *q = qdisc_priv(sch);
1530 	struct htb_class *parent = cl->parent;
1531 
1532 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1533 
1534 	if (parent->cmode != HTB_CAN_SEND)
1535 		htb_safe_rb_erase(&parent->pq_node,
1536 				  &q->hlevel[parent->level].wait_pq);
1537 
1538 	parent->level = 0;
1539 	memset(&parent->inner, 0, sizeof(parent->inner));
1540 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1541 	parent->tokens = parent->buffer;
1542 	parent->ctokens = parent->cbuffer;
1543 	parent->t_c = ktime_get_ns();
1544 	parent->cmode = HTB_CAN_SEND;
1545 	if (q->offload)
1546 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1547 }
1548 
1549 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1550 				       struct netdev_queue *dev_queue,
1551 				       struct Qdisc *new_q)
1552 {
1553 	struct Qdisc *old_q;
1554 
1555 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1556 	if (new_q)
1557 		qdisc_refcount_inc(new_q);
1558 	old_q = htb_graft_helper(dev_queue, new_q);
1559 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1560 }
1561 
1562 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1563 				     bool last_child, bool destroying,
1564 				     struct netlink_ext_ack *extack)
1565 {
1566 	struct tc_htb_qopt_offload offload_opt;
1567 	struct netdev_queue *dev_queue;
1568 	struct Qdisc *q = cl->leaf.q;
1569 	struct Qdisc *old = NULL;
1570 	int err;
1571 
1572 	if (cl->level)
1573 		return -EINVAL;
1574 
1575 	WARN_ON(!q);
1576 	dev_queue = htb_offload_get_queue(cl);
1577 	old = htb_graft_helper(dev_queue, NULL);
1578 	if (destroying)
1579 		/* Before HTB is destroyed, the kernel grafts noop_qdisc to
1580 		 * all queues.
1581 		 */
1582 		WARN_ON(!(old->flags & TCQ_F_BUILTIN));
1583 	else
1584 		WARN_ON(old != q);
1585 
1586 	if (cl->parent) {
1587 		_bstats_update(&cl->parent->bstats_bias,
1588 			       u64_stats_read(&q->bstats.bytes),
1589 			       u64_stats_read(&q->bstats.packets));
1590 	}
1591 
1592 	offload_opt = (struct tc_htb_qopt_offload) {
1593 		.command = !last_child ? TC_HTB_LEAF_DEL :
1594 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1595 			   TC_HTB_LEAF_DEL_LAST,
1596 		.classid = cl->common.classid,
1597 		.extack = extack,
1598 	};
1599 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1600 
1601 	if (!err || destroying)
1602 		qdisc_put(old);
1603 	else
1604 		htb_graft_helper(dev_queue, old);
1605 
1606 	if (last_child)
1607 		return err;
1608 
1609 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1610 		u32 classid = TC_H_MAJ(sch->handle) |
1611 			      TC_H_MIN(offload_opt.classid);
1612 		struct htb_class *moved_cl = htb_find(classid, sch);
1613 
1614 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1615 	}
1616 
1617 	return err;
1618 }
1619 
1620 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1621 {
1622 	if (!cl->level) {
1623 		WARN_ON(!cl->leaf.q);
1624 		qdisc_put(cl->leaf.q);
1625 	}
1626 	gen_kill_estimator(&cl->rate_est);
1627 	tcf_block_put(cl->block);
1628 	kfree(cl);
1629 }
1630 
1631 static void htb_destroy(struct Qdisc *sch)
1632 {
1633 	struct net_device *dev = qdisc_dev(sch);
1634 	struct tc_htb_qopt_offload offload_opt;
1635 	struct htb_sched *q = qdisc_priv(sch);
1636 	struct hlist_node *next;
1637 	bool nonempty, changed;
1638 	struct htb_class *cl;
1639 	unsigned int i;
1640 
1641 	cancel_work_sync(&q->work);
1642 	qdisc_watchdog_cancel(&q->watchdog);
1643 	/* This line used to be after htb_destroy_class call below
1644 	 * and surprisingly it worked in 2.4. But it must precede it
1645 	 * because filter need its target class alive to be able to call
1646 	 * unbind_filter on it (without Oops).
1647 	 */
1648 	tcf_block_put(q->block);
1649 
1650 	for (i = 0; i < q->clhash.hashsize; i++) {
1651 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1652 			tcf_block_put(cl->block);
1653 			cl->block = NULL;
1654 		}
1655 	}
1656 
1657 	do {
1658 		nonempty = false;
1659 		changed = false;
1660 		for (i = 0; i < q->clhash.hashsize; i++) {
1661 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1662 						  common.hnode) {
1663 				bool last_child;
1664 
1665 				if (!q->offload) {
1666 					htb_destroy_class(sch, cl);
1667 					continue;
1668 				}
1669 
1670 				nonempty = true;
1671 
1672 				if (cl->level)
1673 					continue;
1674 
1675 				changed = true;
1676 
1677 				last_child = htb_parent_last_child(cl);
1678 				htb_destroy_class_offload(sch, cl, last_child,
1679 							  true, NULL);
1680 				qdisc_class_hash_remove(&q->clhash,
1681 							&cl->common);
1682 				if (cl->parent)
1683 					cl->parent->children--;
1684 				if (last_child)
1685 					htb_parent_to_leaf(sch, cl, NULL);
1686 				htb_destroy_class(sch, cl);
1687 			}
1688 		}
1689 	} while (changed);
1690 	WARN_ON(nonempty);
1691 
1692 	qdisc_class_hash_destroy(&q->clhash);
1693 	__qdisc_reset_queue(&q->direct_queue);
1694 
1695 	if (!q->offload)
1696 		return;
1697 
1698 	offload_opt = (struct tc_htb_qopt_offload) {
1699 		.command = TC_HTB_DESTROY,
1700 	};
1701 	htb_offload(dev, &offload_opt);
1702 
1703 	if (!q->direct_qdiscs)
1704 		return;
1705 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1706 		qdisc_put(q->direct_qdiscs[i]);
1707 	kfree(q->direct_qdiscs);
1708 }
1709 
1710 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1711 		      struct netlink_ext_ack *extack)
1712 {
1713 	struct htb_sched *q = qdisc_priv(sch);
1714 	struct htb_class *cl = (struct htb_class *)arg;
1715 	struct Qdisc *new_q = NULL;
1716 	int last_child = 0;
1717 	int err;
1718 
1719 	/* TODO: why don't allow to delete subtree ? references ? does
1720 	 * tc subsys guarantee us that in htb_destroy it holds no class
1721 	 * refs so that we can remove children safely there ?
1722 	 */
1723 	if (cl->children || cl->filter_cnt)
1724 		return -EBUSY;
1725 
1726 	if (!cl->level && htb_parent_last_child(cl))
1727 		last_child = 1;
1728 
1729 	if (q->offload) {
1730 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1731 						extack);
1732 		if (err)
1733 			return err;
1734 	}
1735 
1736 	if (last_child) {
1737 		struct netdev_queue *dev_queue = sch->dev_queue;
1738 
1739 		if (q->offload)
1740 			dev_queue = htb_offload_get_queue(cl);
1741 
1742 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1743 					  cl->parent->common.classid,
1744 					  NULL);
1745 		if (q->offload) {
1746 			if (new_q)
1747 				htb_set_lockdep_class_child(new_q);
1748 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1749 		}
1750 	}
1751 
1752 	sch_tree_lock(sch);
1753 
1754 	if (!cl->level)
1755 		qdisc_purge_queue(cl->leaf.q);
1756 
1757 	/* delete from hash and active; remainder in destroy_class */
1758 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1759 	if (cl->parent)
1760 		cl->parent->children--;
1761 
1762 	if (cl->prio_activity)
1763 		htb_deactivate(q, cl);
1764 
1765 	if (cl->cmode != HTB_CAN_SEND)
1766 		htb_safe_rb_erase(&cl->pq_node,
1767 				  &q->hlevel[cl->level].wait_pq);
1768 
1769 	if (last_child)
1770 		htb_parent_to_leaf(sch, cl, new_q);
1771 
1772 	sch_tree_unlock(sch);
1773 
1774 	htb_destroy_class(sch, cl);
1775 	return 0;
1776 }
1777 
1778 static int htb_change_class(struct Qdisc *sch, u32 classid,
1779 			    u32 parentid, struct nlattr **tca,
1780 			    unsigned long *arg, struct netlink_ext_ack *extack)
1781 {
1782 	int err = -EINVAL;
1783 	struct htb_sched *q = qdisc_priv(sch);
1784 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1785 	struct tc_htb_qopt_offload offload_opt;
1786 	struct nlattr *opt = tca[TCA_OPTIONS];
1787 	struct nlattr *tb[TCA_HTB_MAX + 1];
1788 	struct Qdisc *parent_qdisc = NULL;
1789 	struct netdev_queue *dev_queue;
1790 	struct tc_htb_opt *hopt;
1791 	u64 rate64, ceil64;
1792 	int warn = 0;
1793 
1794 	/* extract all subattrs from opt attr */
1795 	if (!opt)
1796 		goto failure;
1797 
1798 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1799 					  NULL);
1800 	if (err < 0)
1801 		goto failure;
1802 
1803 	err = -EINVAL;
1804 	if (tb[TCA_HTB_PARMS] == NULL)
1805 		goto failure;
1806 
1807 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1808 
1809 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1810 	if (!hopt->rate.rate || !hopt->ceil.rate)
1811 		goto failure;
1812 
1813 	if (q->offload) {
1814 		/* Options not supported by the offload. */
1815 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1816 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1817 			goto failure;
1818 		}
1819 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1820 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1821 			goto failure;
1822 		}
1823 		if (hopt->quantum) {
1824 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter");
1825 			goto failure;
1826 		}
1827 		if (hopt->prio) {
1828 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the prio parameter");
1829 			goto failure;
1830 		}
1831 	}
1832 
1833 	/* Keeping backward compatible with rate_table based iproute2 tc */
1834 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1835 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1836 					      NULL));
1837 
1838 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1839 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1840 					      NULL));
1841 
1842 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1843 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1844 
1845 	if (!cl) {		/* new class */
1846 		struct net_device *dev = qdisc_dev(sch);
1847 		struct Qdisc *new_q, *old_q;
1848 		int prio;
1849 		struct {
1850 			struct nlattr		nla;
1851 			struct gnet_estimator	opt;
1852 		} est = {
1853 			.nla = {
1854 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1855 				.nla_type	= TCA_RATE,
1856 			},
1857 			.opt = {
1858 				/* 4s interval, 16s averaging constant */
1859 				.interval	= 2,
1860 				.ewma_log	= 2,
1861 			},
1862 		};
1863 
1864 		/* check for valid classid */
1865 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1866 		    htb_find(classid, sch))
1867 			goto failure;
1868 
1869 		/* check maximal depth */
1870 		if (parent && parent->parent && parent->parent->level < 2) {
1871 			pr_err("htb: tree is too deep\n");
1872 			goto failure;
1873 		}
1874 		err = -ENOBUFS;
1875 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1876 		if (!cl)
1877 			goto failure;
1878 
1879 		gnet_stats_basic_sync_init(&cl->bstats);
1880 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1881 
1882 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1883 		if (err) {
1884 			kfree(cl);
1885 			goto failure;
1886 		}
1887 		if (htb_rate_est || tca[TCA_RATE]) {
1888 			err = gen_new_estimator(&cl->bstats, NULL,
1889 						&cl->rate_est,
1890 						NULL,
1891 						true,
1892 						tca[TCA_RATE] ? : &est.nla);
1893 			if (err)
1894 				goto err_block_put;
1895 		}
1896 
1897 		cl->children = 0;
1898 		RB_CLEAR_NODE(&cl->pq_node);
1899 
1900 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1901 			RB_CLEAR_NODE(&cl->node[prio]);
1902 
1903 		cl->common.classid = classid;
1904 
1905 		/* Make sure nothing interrupts us in between of two
1906 		 * ndo_setup_tc calls.
1907 		 */
1908 		ASSERT_RTNL();
1909 
1910 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1911 		 * so that can't be used inside of sch_tree_lock
1912 		 * -- thanks to Karlis Peisenieks
1913 		 */
1914 		if (!q->offload) {
1915 			dev_queue = sch->dev_queue;
1916 		} else if (!(parent && !parent->level)) {
1917 			/* Assign a dev_queue to this classid. */
1918 			offload_opt = (struct tc_htb_qopt_offload) {
1919 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1920 				.classid = cl->common.classid,
1921 				.parent_classid = parent ?
1922 					TC_H_MIN(parent->common.classid) :
1923 					TC_HTB_CLASSID_ROOT,
1924 				.rate = max_t(u64, hopt->rate.rate, rate64),
1925 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1926 				.extack = extack,
1927 			};
1928 			err = htb_offload(dev, &offload_opt);
1929 			if (err) {
1930 				pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n",
1931 				       err);
1932 				goto err_kill_estimator;
1933 			}
1934 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1935 		} else { /* First child. */
1936 			dev_queue = htb_offload_get_queue(parent);
1937 			old_q = htb_graft_helper(dev_queue, NULL);
1938 			WARN_ON(old_q != parent->leaf.q);
1939 			offload_opt = (struct tc_htb_qopt_offload) {
1940 				.command = TC_HTB_LEAF_TO_INNER,
1941 				.classid = cl->common.classid,
1942 				.parent_classid =
1943 					TC_H_MIN(parent->common.classid),
1944 				.rate = max_t(u64, hopt->rate.rate, rate64),
1945 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1946 				.extack = extack,
1947 			};
1948 			err = htb_offload(dev, &offload_opt);
1949 			if (err) {
1950 				pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n",
1951 				       err);
1952 				htb_graft_helper(dev_queue, old_q);
1953 				goto err_kill_estimator;
1954 			}
1955 			_bstats_update(&parent->bstats_bias,
1956 				       u64_stats_read(&old_q->bstats.bytes),
1957 				       u64_stats_read(&old_q->bstats.packets));
1958 			qdisc_put(old_q);
1959 		}
1960 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1961 					  classid, NULL);
1962 		if (q->offload) {
1963 			if (new_q) {
1964 				htb_set_lockdep_class_child(new_q);
1965 				/* One ref for cl->leaf.q, the other for
1966 				 * dev_queue->qdisc.
1967 				 */
1968 				qdisc_refcount_inc(new_q);
1969 			}
1970 			old_q = htb_graft_helper(dev_queue, new_q);
1971 			/* No qdisc_put needed. */
1972 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1973 		}
1974 		sch_tree_lock(sch);
1975 		if (parent && !parent->level) {
1976 			/* turn parent into inner node */
1977 			qdisc_purge_queue(parent->leaf.q);
1978 			parent_qdisc = parent->leaf.q;
1979 			if (parent->prio_activity)
1980 				htb_deactivate(q, parent);
1981 
1982 			/* remove from evt list because of level change */
1983 			if (parent->cmode != HTB_CAN_SEND) {
1984 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1985 				parent->cmode = HTB_CAN_SEND;
1986 			}
1987 			parent->level = (parent->parent ? parent->parent->level
1988 					 : TC_HTB_MAXDEPTH) - 1;
1989 			memset(&parent->inner, 0, sizeof(parent->inner));
1990 		}
1991 
1992 		/* leaf (we) needs elementary qdisc */
1993 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1994 		if (q->offload)
1995 			cl->leaf.offload_queue = dev_queue;
1996 
1997 		cl->parent = parent;
1998 
1999 		/* set class to be in HTB_CAN_SEND state */
2000 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
2001 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
2002 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
2003 		cl->t_c = ktime_get_ns();
2004 		cl->cmode = HTB_CAN_SEND;
2005 
2006 		/* attach to the hash list and parent's family */
2007 		qdisc_class_hash_insert(&q->clhash, &cl->common);
2008 		if (parent)
2009 			parent->children++;
2010 		if (cl->leaf.q != &noop_qdisc)
2011 			qdisc_hash_add(cl->leaf.q, true);
2012 	} else {
2013 		if (tca[TCA_RATE]) {
2014 			err = gen_replace_estimator(&cl->bstats, NULL,
2015 						    &cl->rate_est,
2016 						    NULL,
2017 						    true,
2018 						    tca[TCA_RATE]);
2019 			if (err)
2020 				return err;
2021 		}
2022 
2023 		if (q->offload) {
2024 			struct net_device *dev = qdisc_dev(sch);
2025 
2026 			offload_opt = (struct tc_htb_qopt_offload) {
2027 				.command = TC_HTB_NODE_MODIFY,
2028 				.classid = cl->common.classid,
2029 				.rate = max_t(u64, hopt->rate.rate, rate64),
2030 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2031 				.extack = extack,
2032 			};
2033 			err = htb_offload(dev, &offload_opt);
2034 			if (err)
2035 				/* Estimator was replaced, and rollback may fail
2036 				 * as well, so we don't try to recover it, and
2037 				 * the estimator won't work property with the
2038 				 * offload anyway, because bstats are updated
2039 				 * only when the stats are queried.
2040 				 */
2041 				return err;
2042 		}
2043 
2044 		sch_tree_lock(sch);
2045 	}
2046 
2047 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2048 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2049 
2050 	/* it used to be a nasty bug here, we have to check that node
2051 	 * is really leaf before changing cl->leaf !
2052 	 */
2053 	if (!cl->level) {
2054 		u64 quantum = cl->rate.rate_bytes_ps;
2055 
2056 		do_div(quantum, q->rate2quantum);
2057 		cl->quantum = min_t(u64, quantum, INT_MAX);
2058 
2059 		if (!hopt->quantum && cl->quantum < 1000) {
2060 			warn = -1;
2061 			cl->quantum = 1000;
2062 		}
2063 		if (!hopt->quantum && cl->quantum > 200000) {
2064 			warn = 1;
2065 			cl->quantum = 200000;
2066 		}
2067 		if (hopt->quantum)
2068 			cl->quantum = hopt->quantum;
2069 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2070 			cl->prio = TC_HTB_NUMPRIO - 1;
2071 	}
2072 
2073 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2074 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2075 
2076 	sch_tree_unlock(sch);
2077 	qdisc_put(parent_qdisc);
2078 
2079 	if (warn)
2080 		pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
2081 			    cl->common.classid, (warn == -1 ? "small" : "big"));
2082 
2083 	qdisc_class_hash_grow(sch, &q->clhash);
2084 
2085 	*arg = (unsigned long)cl;
2086 	return 0;
2087 
2088 err_kill_estimator:
2089 	gen_kill_estimator(&cl->rate_est);
2090 err_block_put:
2091 	tcf_block_put(cl->block);
2092 	kfree(cl);
2093 failure:
2094 	return err;
2095 }
2096 
2097 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2098 				       struct netlink_ext_ack *extack)
2099 {
2100 	struct htb_sched *q = qdisc_priv(sch);
2101 	struct htb_class *cl = (struct htb_class *)arg;
2102 
2103 	return cl ? cl->block : q->block;
2104 }
2105 
2106 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2107 				     u32 classid)
2108 {
2109 	struct htb_class *cl = htb_find(classid, sch);
2110 
2111 	/*if (cl && !cl->level) return 0;
2112 	 * The line above used to be there to prevent attaching filters to
2113 	 * leaves. But at least tc_index filter uses this just to get class
2114 	 * for other reasons so that we have to allow for it.
2115 	 * ----
2116 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2117 	 * another way to "lock" the class - unlike "get" this lock can
2118 	 * be broken by class during destroy IIUC.
2119 	 */
2120 	if (cl)
2121 		cl->filter_cnt++;
2122 	return (unsigned long)cl;
2123 }
2124 
2125 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2126 {
2127 	struct htb_class *cl = (struct htb_class *)arg;
2128 
2129 	if (cl)
2130 		cl->filter_cnt--;
2131 }
2132 
2133 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2134 {
2135 	struct htb_sched *q = qdisc_priv(sch);
2136 	struct htb_class *cl;
2137 	unsigned int i;
2138 
2139 	if (arg->stop)
2140 		return;
2141 
2142 	for (i = 0; i < q->clhash.hashsize; i++) {
2143 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2144 			if (arg->count < arg->skip) {
2145 				arg->count++;
2146 				continue;
2147 			}
2148 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2149 				arg->stop = 1;
2150 				return;
2151 			}
2152 			arg->count++;
2153 		}
2154 	}
2155 }
2156 
2157 static const struct Qdisc_class_ops htb_class_ops = {
2158 	.select_queue	=	htb_select_queue,
2159 	.graft		=	htb_graft,
2160 	.leaf		=	htb_leaf,
2161 	.qlen_notify	=	htb_qlen_notify,
2162 	.find		=	htb_search,
2163 	.change		=	htb_change_class,
2164 	.delete		=	htb_delete,
2165 	.walk		=	htb_walk,
2166 	.tcf_block	=	htb_tcf_block,
2167 	.bind_tcf	=	htb_bind_filter,
2168 	.unbind_tcf	=	htb_unbind_filter,
2169 	.dump		=	htb_dump_class,
2170 	.dump_stats	=	htb_dump_class_stats,
2171 };
2172 
2173 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2174 	.cl_ops		=	&htb_class_ops,
2175 	.id		=	"htb",
2176 	.priv_size	=	sizeof(struct htb_sched),
2177 	.enqueue	=	htb_enqueue,
2178 	.dequeue	=	htb_dequeue,
2179 	.peek		=	qdisc_peek_dequeued,
2180 	.init		=	htb_init,
2181 	.attach		=	htb_attach,
2182 	.reset		=	htb_reset,
2183 	.destroy	=	htb_destroy,
2184 	.dump		=	htb_dump,
2185 	.owner		=	THIS_MODULE,
2186 };
2187 
2188 static int __init htb_module_init(void)
2189 {
2190 	return register_qdisc(&htb_qdisc_ops);
2191 }
2192 static void __exit htb_module_exit(void)
2193 {
2194 	unregister_qdisc(&htb_qdisc_ops);
2195 }
2196 
2197 module_init(htb_module_init)
2198 module_exit(htb_module_exit)
2199 MODULE_LICENSE("GPL");
2200