xref: /openbmc/linux/net/sched/sch_htb.c (revision e761cc20)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 	int			filter_cnt;
106 
107 	int			level;		/* our level (see above) */
108 	unsigned int		children;
109 	struct htb_class	*parent;	/* parent class */
110 
111 	struct net_rate_estimator __rcu *rate_est;
112 
113 	/*
114 	 * Written often fields
115 	 */
116 	struct gnet_stats_basic_sync bstats;
117 	struct gnet_stats_basic_sync bstats_bias;
118 	struct tc_htb_xstats	xstats;	/* our special stats */
119 
120 	/* token bucket parameters */
121 	s64			tokens, ctokens;/* current number of tokens */
122 	s64			t_c;		/* checkpoint time */
123 
124 	union {
125 		struct htb_class_leaf {
126 			int		deficit[TC_HTB_MAXDEPTH];
127 			struct Qdisc	*q;
128 			struct netdev_queue *offload_queue;
129 		} leaf;
130 		struct htb_class_inner {
131 			struct htb_prio clprio[TC_HTB_NUMPRIO];
132 		} inner;
133 	};
134 	s64			pq_key;
135 
136 	int			prio_activity;	/* for which prios are we active */
137 	enum htb_cmode		cmode;		/* current mode of the class */
138 	struct rb_node		pq_node;	/* node for event queue */
139 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
140 
141 	unsigned int drops ____cacheline_aligned_in_smp;
142 	unsigned int		overlimits;
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto __rcu	*filter_list;
157 	struct tcf_block	*block;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non shaped skbs; let them go directly thru */
165 	struct qdisc_skb_head	direct_queue;
166 	u32			direct_pkts;
167 	u32			overlimits;
168 
169 	struct qdisc_watchdog	watchdog;
170 
171 	s64			now;	/* cached dequeue time */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 
180 	struct Qdisc		**direct_qdiscs;
181 	unsigned int            num_direct_qdiscs;
182 
183 	bool			offload;
184 };
185 
186 /* find class in global hash table using given handle */
187 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 {
189 	struct htb_sched *q = qdisc_priv(sch);
190 	struct Qdisc_class_common *clc;
191 
192 	clc = qdisc_class_find(&q->clhash, handle);
193 	if (clc == NULL)
194 		return NULL;
195 	return container_of(clc, struct htb_class, common);
196 }
197 
198 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 {
200 	return (unsigned long)htb_find(handle, sch);
201 }
202 
203 #define HTB_DIRECT ((struct htb_class *)-1L)
204 
205 /**
206  * htb_classify - classify a packet into class
207  * @skb: the socket buffer
208  * @sch: the active queue discipline
209  * @qerr: pointer for returned status code
210  *
211  * It returns NULL if the packet should be dropped or -1 if the packet
212  * should be passed directly thru. In all other cases leaf class is returned.
213  * We allow direct class selection by classid in priority. The we examine
214  * filters in qdisc and in inner nodes (if higher filter points to the inner
215  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
216  * internal fifo (direct). These packets then go directly thru. If we still
217  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
218  * then finish and return direct queue.
219  */
220 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
221 				      int *qerr)
222 {
223 	struct htb_sched *q = qdisc_priv(sch);
224 	struct htb_class *cl;
225 	struct tcf_result res;
226 	struct tcf_proto *tcf;
227 	int result;
228 
229 	/* allow to select class by setting skb->priority to valid classid;
230 	 * note that nfmark can be used too by attaching filter fw with no
231 	 * rules in it
232 	 */
233 	if (skb->priority == sch->handle)
234 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
235 	cl = htb_find(skb->priority, sch);
236 	if (cl) {
237 		if (cl->level == 0)
238 			return cl;
239 		/* Start with inner filter chain if a non-leaf class is selected */
240 		tcf = rcu_dereference_bh(cl->filter_list);
241 	} else {
242 		tcf = rcu_dereference_bh(q->filter_list);
243 	}
244 
245 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
246 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
247 #ifdef CONFIG_NET_CLS_ACT
248 		switch (result) {
249 		case TC_ACT_QUEUED:
250 		case TC_ACT_STOLEN:
251 		case TC_ACT_TRAP:
252 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 			fallthrough;
254 		case TC_ACT_SHOT:
255 			return NULL;
256 		}
257 #endif
258 		cl = (void *)res.class;
259 		if (!cl) {
260 			if (res.classid == sch->handle)
261 				return HTB_DIRECT;	/* X:0 (direct flow) */
262 			cl = htb_find(res.classid, sch);
263 			if (!cl)
264 				break;	/* filter selected invalid classid */
265 		}
266 		if (!cl->level)
267 			return cl;	/* we hit leaf; return it */
268 
269 		/* we have got inner class; apply inner filter chain */
270 		tcf = rcu_dereference_bh(cl->filter_list);
271 	}
272 	/* classification failed; try to use default class */
273 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
274 	if (!cl || cl->level)
275 		return HTB_DIRECT;	/* bad default .. this is safe bet */
276 	return cl;
277 }
278 
279 /**
280  * htb_add_to_id_tree - adds class to the round robin list
281  * @root: the root of the tree
282  * @cl: the class to add
283  * @prio: the give prio in class
284  *
285  * Routine adds class to the list (actually tree) sorted by classid.
286  * Make sure that class is not already on such list for given prio.
287  */
288 static void htb_add_to_id_tree(struct rb_root *root,
289 			       struct htb_class *cl, int prio)
290 {
291 	struct rb_node **p = &root->rb_node, *parent = NULL;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, node[prio]);
297 
298 		if (cl->common.classid > c->common.classid)
299 			p = &parent->rb_right;
300 		else
301 			p = &parent->rb_left;
302 	}
303 	rb_link_node(&cl->node[prio], parent, p);
304 	rb_insert_color(&cl->node[prio], root);
305 }
306 
307 /**
308  * htb_add_to_wait_tree - adds class to the event queue with delay
309  * @q: the priority event queue
310  * @cl: the class to add
311  * @delay: delay in microseconds
312  *
313  * The class is added to priority event queue to indicate that class will
314  * change its mode in cl->pq_key microseconds. Make sure that class is not
315  * already in the queue.
316  */
317 static void htb_add_to_wait_tree(struct htb_sched *q,
318 				 struct htb_class *cl, s64 delay)
319 {
320 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
321 
322 	cl->pq_key = q->now + delay;
323 	if (cl->pq_key == q->now)
324 		cl->pq_key++;
325 
326 	/* update the nearest event cache */
327 	if (q->near_ev_cache[cl->level] > cl->pq_key)
328 		q->near_ev_cache[cl->level] = cl->pq_key;
329 
330 	while (*p) {
331 		struct htb_class *c;
332 		parent = *p;
333 		c = rb_entry(parent, struct htb_class, pq_node);
334 		if (cl->pq_key >= c->pq_key)
335 			p = &parent->rb_right;
336 		else
337 			p = &parent->rb_left;
338 	}
339 	rb_link_node(&cl->pq_node, parent, p);
340 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
341 }
342 
343 /**
344  * htb_next_rb_node - finds next node in binary tree
345  * @n: the current node in binary tree
346  *
347  * When we are past last key we return NULL.
348  * Average complexity is 2 steps per call.
349  */
350 static inline void htb_next_rb_node(struct rb_node **n)
351 {
352 	*n = rb_next(*n);
353 }
354 
355 /**
356  * htb_add_class_to_row - add class to its row
357  * @q: the priority event queue
358  * @cl: the class to add
359  * @mask: the given priorities in class in bitmap
360  *
361  * The class is added to row at priorities marked in mask.
362  * It does nothing if mask == 0.
363  */
364 static inline void htb_add_class_to_row(struct htb_sched *q,
365 					struct htb_class *cl, int mask)
366 {
367 	q->row_mask[cl->level] |= mask;
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		mask &= ~(1 << prio);
371 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
372 	}
373 }
374 
375 /* If this triggers, it is a bug in this code, but it need not be fatal */
376 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
377 {
378 	if (RB_EMPTY_NODE(rb)) {
379 		WARN_ON(1);
380 	} else {
381 		rb_erase(rb, root);
382 		RB_CLEAR_NODE(rb);
383 	}
384 }
385 
386 
387 /**
388  * htb_remove_class_from_row - removes class from its row
389  * @q: the priority event queue
390  * @cl: the class to add
391  * @mask: the given priorities in class in bitmap
392  *
393  * The class is removed from row at priorities marked in mask.
394  * It does nothing if mask == 0.
395  */
396 static inline void htb_remove_class_from_row(struct htb_sched *q,
397 						 struct htb_class *cl, int mask)
398 {
399 	int m = 0;
400 	struct htb_level *hlevel = &q->hlevel[cl->level];
401 
402 	while (mask) {
403 		int prio = ffz(~mask);
404 		struct htb_prio *hprio = &hlevel->hprio[prio];
405 
406 		mask &= ~(1 << prio);
407 		if (hprio->ptr == cl->node + prio)
408 			htb_next_rb_node(&hprio->ptr);
409 
410 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
411 		if (!hprio->row.rb_node)
412 			m |= 1 << prio;
413 	}
414 	q->row_mask[cl->level] &= ~m;
415 }
416 
417 /**
418  * htb_activate_prios - creates active classe's feed chain
419  * @q: the priority event queue
420  * @cl: the class to activate
421  *
422  * The class is connected to ancestors and/or appropriate rows
423  * for priorities it is participating on. cl->cmode must be new
424  * (activated) mode. It does nothing if cl->prio_activity == 0.
425  */
426 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
427 {
428 	struct htb_class *p = cl->parent;
429 	long m, mask = cl->prio_activity;
430 
431 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
432 		m = mask;
433 		while (m) {
434 			int prio = ffz(~m);
435 			m &= ~(1 << prio);
436 
437 			if (p->inner.clprio[prio].feed.rb_node)
438 				/* parent already has its feed in use so that
439 				 * reset bit in mask as parent is already ok
440 				 */
441 				mask &= ~(1 << prio);
442 
443 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
444 		}
445 		p->prio_activity |= mask;
446 		cl = p;
447 		p = cl->parent;
448 
449 	}
450 	if (cl->cmode == HTB_CAN_SEND && mask)
451 		htb_add_class_to_row(q, cl, mask);
452 }
453 
454 /**
455  * htb_deactivate_prios - remove class from feed chain
456  * @q: the priority event queue
457  * @cl: the class to deactivate
458  *
459  * cl->cmode must represent old mode (before deactivation). It does
460  * nothing if cl->prio_activity == 0. Class is removed from all feed
461  * chains and rows.
462  */
463 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
464 {
465 	struct htb_class *p = cl->parent;
466 	long m, mask = cl->prio_activity;
467 
468 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
469 		m = mask;
470 		mask = 0;
471 		while (m) {
472 			int prio = ffz(~m);
473 			m &= ~(1 << prio);
474 
475 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
476 				/* we are removing child which is pointed to from
477 				 * parent feed - forget the pointer but remember
478 				 * classid
479 				 */
480 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
481 				p->inner.clprio[prio].ptr = NULL;
482 			}
483 
484 			htb_safe_rb_erase(cl->node + prio,
485 					  &p->inner.clprio[prio].feed);
486 
487 			if (!p->inner.clprio[prio].feed.rb_node)
488 				mask |= 1 << prio;
489 		}
490 
491 		p->prio_activity &= ~mask;
492 		cl = p;
493 		p = cl->parent;
494 
495 	}
496 	if (cl->cmode == HTB_CAN_SEND && mask)
497 		htb_remove_class_from_row(q, cl, mask);
498 }
499 
500 static inline s64 htb_lowater(const struct htb_class *cl)
501 {
502 	if (htb_hysteresis)
503 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
504 	else
505 		return 0;
506 }
507 static inline s64 htb_hiwater(const struct htb_class *cl)
508 {
509 	if (htb_hysteresis)
510 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
511 	else
512 		return 0;
513 }
514 
515 
516 /**
517  * htb_class_mode - computes and returns current class mode
518  * @cl: the target class
519  * @diff: diff time in microseconds
520  *
521  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
522  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
523  * from now to time when cl will change its state.
524  * Also it is worth to note that class mode doesn't change simply
525  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
526  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
527  * mode transitions per time unit. The speed gain is about 1/6.
528  */
529 static inline enum htb_cmode
530 htb_class_mode(struct htb_class *cl, s64 *diff)
531 {
532 	s64 toks;
533 
534 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
535 		*diff = -toks;
536 		return HTB_CANT_SEND;
537 	}
538 
539 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
540 		return HTB_CAN_SEND;
541 
542 	*diff = -toks;
543 	return HTB_MAY_BORROW;
544 }
545 
546 /**
547  * htb_change_class_mode - changes classe's mode
548  * @q: the priority event queue
549  * @cl: the target class
550  * @diff: diff time in microseconds
551  *
552  * This should be the only way how to change classe's mode under normal
553  * circumstances. Routine will update feed lists linkage, change mode
554  * and add class to the wait event queue if appropriate. New mode should
555  * be different from old one and cl->pq_key has to be valid if changing
556  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
557  */
558 static void
559 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
560 {
561 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
562 
563 	if (new_mode == cl->cmode)
564 		return;
565 
566 	if (new_mode == HTB_CANT_SEND) {
567 		cl->overlimits++;
568 		q->overlimits++;
569 	}
570 
571 	if (cl->prio_activity) {	/* not necessary: speed optimization */
572 		if (cl->cmode != HTB_CANT_SEND)
573 			htb_deactivate_prios(q, cl);
574 		cl->cmode = new_mode;
575 		if (new_mode != HTB_CANT_SEND)
576 			htb_activate_prios(q, cl);
577 	} else
578 		cl->cmode = new_mode;
579 }
580 
581 /**
582  * htb_activate - inserts leaf cl into appropriate active feeds
583  * @q: the priority event queue
584  * @cl: the target class
585  *
586  * Routine learns (new) priority of leaf and activates feed chain
587  * for the prio. It can be called on already active leaf safely.
588  * It also adds leaf into droplist.
589  */
590 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
591 {
592 	WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
593 
594 	if (!cl->prio_activity) {
595 		cl->prio_activity = 1 << cl->prio;
596 		htb_activate_prios(q, cl);
597 	}
598 }
599 
600 /**
601  * htb_deactivate - remove leaf cl from active feeds
602  * @q: the priority event queue
603  * @cl: the target class
604  *
605  * Make sure that leaf is active. In the other words it can't be called
606  * with non-active leaf. It also removes class from the drop list.
607  */
608 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
609 {
610 	WARN_ON(!cl->prio_activity);
611 
612 	htb_deactivate_prios(q, cl);
613 	cl->prio_activity = 0;
614 }
615 
616 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
617 		       struct sk_buff **to_free)
618 {
619 	int ret;
620 	unsigned int len = qdisc_pkt_len(skb);
621 	struct htb_sched *q = qdisc_priv(sch);
622 	struct htb_class *cl = htb_classify(skb, sch, &ret);
623 
624 	if (cl == HTB_DIRECT) {
625 		/* enqueue to helper queue */
626 		if (q->direct_queue.qlen < q->direct_qlen) {
627 			__qdisc_enqueue_tail(skb, &q->direct_queue);
628 			q->direct_pkts++;
629 		} else {
630 			return qdisc_drop(skb, sch, to_free);
631 		}
632 #ifdef CONFIG_NET_CLS_ACT
633 	} else if (!cl) {
634 		if (ret & __NET_XMIT_BYPASS)
635 			qdisc_qstats_drop(sch);
636 		__qdisc_drop(skb, to_free);
637 		return ret;
638 #endif
639 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
640 					to_free)) != NET_XMIT_SUCCESS) {
641 		if (net_xmit_drop_count(ret)) {
642 			qdisc_qstats_drop(sch);
643 			cl->drops++;
644 		}
645 		return ret;
646 	} else {
647 		htb_activate(q, cl);
648 	}
649 
650 	sch->qstats.backlog += len;
651 	sch->q.qlen++;
652 	return NET_XMIT_SUCCESS;
653 }
654 
655 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
656 {
657 	s64 toks = diff + cl->tokens;
658 
659 	if (toks > cl->buffer)
660 		toks = cl->buffer;
661 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
662 	if (toks <= -cl->mbuffer)
663 		toks = 1 - cl->mbuffer;
664 
665 	cl->tokens = toks;
666 }
667 
668 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
669 {
670 	s64 toks = diff + cl->ctokens;
671 
672 	if (toks > cl->cbuffer)
673 		toks = cl->cbuffer;
674 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
675 	if (toks <= -cl->mbuffer)
676 		toks = 1 - cl->mbuffer;
677 
678 	cl->ctokens = toks;
679 }
680 
681 /**
682  * htb_charge_class - charges amount "bytes" to leaf and ancestors
683  * @q: the priority event queue
684  * @cl: the class to start iterate
685  * @level: the minimum level to account
686  * @skb: the socket buffer
687  *
688  * Routine assumes that packet "bytes" long was dequeued from leaf cl
689  * borrowing from "level". It accounts bytes to ceil leaky bucket for
690  * leaf and all ancestors and to rate bucket for ancestors at levels
691  * "level" and higher. It also handles possible change of mode resulting
692  * from the update. Note that mode can also increase here (MAY_BORROW to
693  * CAN_SEND) because we can use more precise clock that event queue here.
694  * In such case we remove class from event queue first.
695  */
696 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
697 			     int level, struct sk_buff *skb)
698 {
699 	int bytes = qdisc_pkt_len(skb);
700 	enum htb_cmode old_mode;
701 	s64 diff;
702 
703 	while (cl) {
704 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
705 		if (cl->level >= level) {
706 			if (cl->level == level)
707 				cl->xstats.lends++;
708 			htb_accnt_tokens(cl, bytes, diff);
709 		} else {
710 			cl->xstats.borrows++;
711 			cl->tokens += diff;	/* we moved t_c; update tokens */
712 		}
713 		htb_accnt_ctokens(cl, bytes, diff);
714 		cl->t_c = q->now;
715 
716 		old_mode = cl->cmode;
717 		diff = 0;
718 		htb_change_class_mode(q, cl, &diff);
719 		if (old_mode != cl->cmode) {
720 			if (old_mode != HTB_CAN_SEND)
721 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
722 			if (cl->cmode != HTB_CAN_SEND)
723 				htb_add_to_wait_tree(q, cl, diff);
724 		}
725 
726 		/* update basic stats except for leaves which are already updated */
727 		if (cl->level)
728 			bstats_update(&cl->bstats, skb);
729 
730 		cl = cl->parent;
731 	}
732 }
733 
734 /**
735  * htb_do_events - make mode changes to classes at the level
736  * @q: the priority event queue
737  * @level: which wait_pq in 'q->hlevel'
738  * @start: start jiffies
739  *
740  * Scans event queue for pending events and applies them. Returns time of
741  * next pending event (0 for no event in pq, q->now for too many events).
742  * Note: Applied are events whose have cl->pq_key <= q->now.
743  */
744 static s64 htb_do_events(struct htb_sched *q, const int level,
745 			 unsigned long start)
746 {
747 	/* don't run for longer than 2 jiffies; 2 is used instead of
748 	 * 1 to simplify things when jiffy is going to be incremented
749 	 * too soon
750 	 */
751 	unsigned long stop_at = start + 2;
752 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
753 
754 	while (time_before(jiffies, stop_at)) {
755 		struct htb_class *cl;
756 		s64 diff;
757 		struct rb_node *p = rb_first(wait_pq);
758 
759 		if (!p)
760 			return 0;
761 
762 		cl = rb_entry(p, struct htb_class, pq_node);
763 		if (cl->pq_key > q->now)
764 			return cl->pq_key;
765 
766 		htb_safe_rb_erase(p, wait_pq);
767 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
768 		htb_change_class_mode(q, cl, &diff);
769 		if (cl->cmode != HTB_CAN_SEND)
770 			htb_add_to_wait_tree(q, cl, diff);
771 	}
772 
773 	/* too much load - let's continue after a break for scheduling */
774 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
775 		pr_warn("htb: too many events!\n");
776 		q->warned |= HTB_WARN_TOOMANYEVENTS;
777 	}
778 
779 	return q->now;
780 }
781 
782 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
783  * is no such one exists.
784  */
785 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
786 					      u32 id)
787 {
788 	struct rb_node *r = NULL;
789 	while (n) {
790 		struct htb_class *cl =
791 		    rb_entry(n, struct htb_class, node[prio]);
792 
793 		if (id > cl->common.classid) {
794 			n = n->rb_right;
795 		} else if (id < cl->common.classid) {
796 			r = n;
797 			n = n->rb_left;
798 		} else {
799 			return n;
800 		}
801 	}
802 	return r;
803 }
804 
805 /**
806  * htb_lookup_leaf - returns next leaf class in DRR order
807  * @hprio: the current one
808  * @prio: which prio in class
809  *
810  * Find leaf where current feed pointers points to.
811  */
812 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
813 {
814 	int i;
815 	struct {
816 		struct rb_node *root;
817 		struct rb_node **pptr;
818 		u32 *pid;
819 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
820 
821 	BUG_ON(!hprio->row.rb_node);
822 	sp->root = hprio->row.rb_node;
823 	sp->pptr = &hprio->ptr;
824 	sp->pid = &hprio->last_ptr_id;
825 
826 	for (i = 0; i < 65535; i++) {
827 		if (!*sp->pptr && *sp->pid) {
828 			/* ptr was invalidated but id is valid - try to recover
829 			 * the original or next ptr
830 			 */
831 			*sp->pptr =
832 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
833 		}
834 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
835 				 * can become out of date quickly
836 				 */
837 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
838 			*sp->pptr = sp->root;
839 			while ((*sp->pptr)->rb_left)
840 				*sp->pptr = (*sp->pptr)->rb_left;
841 			if (sp > stk) {
842 				sp--;
843 				if (!*sp->pptr) {
844 					WARN_ON(1);
845 					return NULL;
846 				}
847 				htb_next_rb_node(sp->pptr);
848 			}
849 		} else {
850 			struct htb_class *cl;
851 			struct htb_prio *clp;
852 
853 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
854 			if (!cl->level)
855 				return cl;
856 			clp = &cl->inner.clprio[prio];
857 			(++sp)->root = clp->feed.rb_node;
858 			sp->pptr = &clp->ptr;
859 			sp->pid = &clp->last_ptr_id;
860 		}
861 	}
862 	WARN_ON(1);
863 	return NULL;
864 }
865 
866 /* dequeues packet at given priority and level; call only if
867  * you are sure that there is active class at prio/level
868  */
869 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
870 					const int level)
871 {
872 	struct sk_buff *skb = NULL;
873 	struct htb_class *cl, *start;
874 	struct htb_level *hlevel = &q->hlevel[level];
875 	struct htb_prio *hprio = &hlevel->hprio[prio];
876 
877 	/* look initial class up in the row */
878 	start = cl = htb_lookup_leaf(hprio, prio);
879 
880 	do {
881 next:
882 		if (unlikely(!cl))
883 			return NULL;
884 
885 		/* class can be empty - it is unlikely but can be true if leaf
886 		 * qdisc drops packets in enqueue routine or if someone used
887 		 * graft operation on the leaf since last dequeue;
888 		 * simply deactivate and skip such class
889 		 */
890 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
891 			struct htb_class *next;
892 			htb_deactivate(q, cl);
893 
894 			/* row/level might become empty */
895 			if ((q->row_mask[level] & (1 << prio)) == 0)
896 				return NULL;
897 
898 			next = htb_lookup_leaf(hprio, prio);
899 
900 			if (cl == start)	/* fix start if we just deleted it */
901 				start = next;
902 			cl = next;
903 			goto next;
904 		}
905 
906 		skb = cl->leaf.q->dequeue(cl->leaf.q);
907 		if (likely(skb != NULL))
908 			break;
909 
910 		qdisc_warn_nonwc("htb", cl->leaf.q);
911 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
912 					 &q->hlevel[0].hprio[prio].ptr);
913 		cl = htb_lookup_leaf(hprio, prio);
914 
915 	} while (cl != start);
916 
917 	if (likely(skb != NULL)) {
918 		bstats_update(&cl->bstats, skb);
919 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
920 		if (cl->leaf.deficit[level] < 0) {
921 			cl->leaf.deficit[level] += cl->quantum;
922 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
923 						 &q->hlevel[0].hprio[prio].ptr);
924 		}
925 		/* this used to be after charge_class but this constelation
926 		 * gives us slightly better performance
927 		 */
928 		if (!cl->leaf.q->q.qlen)
929 			htb_deactivate(q, cl);
930 		htb_charge_class(q, cl, level, skb);
931 	}
932 	return skb;
933 }
934 
935 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
936 {
937 	struct sk_buff *skb;
938 	struct htb_sched *q = qdisc_priv(sch);
939 	int level;
940 	s64 next_event;
941 	unsigned long start_at;
942 
943 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
944 	skb = __qdisc_dequeue_head(&q->direct_queue);
945 	if (skb != NULL) {
946 ok:
947 		qdisc_bstats_update(sch, skb);
948 		qdisc_qstats_backlog_dec(sch, skb);
949 		sch->q.qlen--;
950 		return skb;
951 	}
952 
953 	if (!sch->q.qlen)
954 		goto fin;
955 	q->now = ktime_get_ns();
956 	start_at = jiffies;
957 
958 	next_event = q->now + 5LLU * NSEC_PER_SEC;
959 
960 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
961 		/* common case optimization - skip event handler quickly */
962 		int m;
963 		s64 event = q->near_ev_cache[level];
964 
965 		if (q->now >= event) {
966 			event = htb_do_events(q, level, start_at);
967 			if (!event)
968 				event = q->now + NSEC_PER_SEC;
969 			q->near_ev_cache[level] = event;
970 		}
971 
972 		if (next_event > event)
973 			next_event = event;
974 
975 		m = ~q->row_mask[level];
976 		while (m != (int)(-1)) {
977 			int prio = ffz(m);
978 
979 			m |= 1 << prio;
980 			skb = htb_dequeue_tree(q, prio, level);
981 			if (likely(skb != NULL))
982 				goto ok;
983 		}
984 	}
985 	if (likely(next_event > q->now))
986 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
987 	else
988 		schedule_work(&q->work);
989 fin:
990 	return skb;
991 }
992 
993 /* reset all classes */
994 /* always caled under BH & queue lock */
995 static void htb_reset(struct Qdisc *sch)
996 {
997 	struct htb_sched *q = qdisc_priv(sch);
998 	struct htb_class *cl;
999 	unsigned int i;
1000 
1001 	for (i = 0; i < q->clhash.hashsize; i++) {
1002 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1003 			if (cl->level)
1004 				memset(&cl->inner, 0, sizeof(cl->inner));
1005 			else {
1006 				if (cl->leaf.q && !q->offload)
1007 					qdisc_reset(cl->leaf.q);
1008 			}
1009 			cl->prio_activity = 0;
1010 			cl->cmode = HTB_CAN_SEND;
1011 		}
1012 	}
1013 	qdisc_watchdog_cancel(&q->watchdog);
1014 	__qdisc_reset_queue(&q->direct_queue);
1015 	memset(q->hlevel, 0, sizeof(q->hlevel));
1016 	memset(q->row_mask, 0, sizeof(q->row_mask));
1017 }
1018 
1019 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1020 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1021 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1022 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1023 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1024 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1025 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1026 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1027 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1028 };
1029 
1030 static void htb_work_func(struct work_struct *work)
1031 {
1032 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1033 	struct Qdisc *sch = q->watchdog.qdisc;
1034 
1035 	rcu_read_lock();
1036 	__netif_schedule(qdisc_root(sch));
1037 	rcu_read_unlock();
1038 }
1039 
1040 static void htb_set_lockdep_class_child(struct Qdisc *q)
1041 {
1042 	static struct lock_class_key child_key;
1043 
1044 	lockdep_set_class(qdisc_lock(q), &child_key);
1045 }
1046 
1047 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1048 {
1049 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1050 }
1051 
1052 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1053 		    struct netlink_ext_ack *extack)
1054 {
1055 	struct net_device *dev = qdisc_dev(sch);
1056 	struct tc_htb_qopt_offload offload_opt;
1057 	struct htb_sched *q = qdisc_priv(sch);
1058 	struct nlattr *tb[TCA_HTB_MAX + 1];
1059 	struct tc_htb_glob *gopt;
1060 	unsigned int ntx;
1061 	bool offload;
1062 	int err;
1063 
1064 	qdisc_watchdog_init(&q->watchdog, sch);
1065 	INIT_WORK(&q->work, htb_work_func);
1066 
1067 	if (!opt)
1068 		return -EINVAL;
1069 
1070 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1071 	if (err)
1072 		return err;
1073 
1074 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1075 					  NULL);
1076 	if (err < 0)
1077 		return err;
1078 
1079 	if (!tb[TCA_HTB_INIT])
1080 		return -EINVAL;
1081 
1082 	gopt = nla_data(tb[TCA_HTB_INIT]);
1083 	if (gopt->version != HTB_VER >> 16)
1084 		return -EINVAL;
1085 
1086 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1087 
1088 	if (offload) {
1089 		if (sch->parent != TC_H_ROOT) {
1090 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1091 			return -EOPNOTSUPP;
1092 		}
1093 
1094 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1095 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1096 			return -EOPNOTSUPP;
1097 		}
1098 
1099 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1100 		q->direct_qdiscs = kcalloc(q->num_direct_qdiscs,
1101 					   sizeof(*q->direct_qdiscs),
1102 					   GFP_KERNEL);
1103 		if (!q->direct_qdiscs)
1104 			return -ENOMEM;
1105 	}
1106 
1107 	err = qdisc_class_hash_init(&q->clhash);
1108 	if (err < 0)
1109 		return err;
1110 
1111 	if (tb[TCA_HTB_DIRECT_QLEN])
1112 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1113 	else
1114 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1115 
1116 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1117 		q->rate2quantum = 1;
1118 	q->defcls = gopt->defcls;
1119 
1120 	if (!offload)
1121 		return 0;
1122 
1123 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1124 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1125 		struct Qdisc *qdisc;
1126 
1127 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1128 					  TC_H_MAKE(sch->handle, 0), extack);
1129 		if (!qdisc) {
1130 			return -ENOMEM;
1131 		}
1132 
1133 		htb_set_lockdep_class_child(qdisc);
1134 		q->direct_qdiscs[ntx] = qdisc;
1135 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1136 	}
1137 
1138 	sch->flags |= TCQ_F_MQROOT;
1139 
1140 	offload_opt = (struct tc_htb_qopt_offload) {
1141 		.command = TC_HTB_CREATE,
1142 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1143 		.classid = TC_H_MIN(q->defcls),
1144 		.extack = extack,
1145 	};
1146 	err = htb_offload(dev, &offload_opt);
1147 	if (err)
1148 		return err;
1149 
1150 	/* Defer this assignment, so that htb_destroy skips offload-related
1151 	 * parts (especially calling ndo_setup_tc) on errors.
1152 	 */
1153 	q->offload = true;
1154 
1155 	return 0;
1156 }
1157 
1158 static void htb_attach_offload(struct Qdisc *sch)
1159 {
1160 	struct net_device *dev = qdisc_dev(sch);
1161 	struct htb_sched *q = qdisc_priv(sch);
1162 	unsigned int ntx;
1163 
1164 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1165 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1166 
1167 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1168 		qdisc_put(old);
1169 		qdisc_hash_add(qdisc, false);
1170 	}
1171 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1172 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1173 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1174 
1175 		qdisc_put(old);
1176 	}
1177 
1178 	kfree(q->direct_qdiscs);
1179 	q->direct_qdiscs = NULL;
1180 }
1181 
1182 static void htb_attach_software(struct Qdisc *sch)
1183 {
1184 	struct net_device *dev = qdisc_dev(sch);
1185 	unsigned int ntx;
1186 
1187 	/* Resemble qdisc_graft behavior. */
1188 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1189 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1190 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1191 
1192 		qdisc_refcount_inc(sch);
1193 
1194 		qdisc_put(old);
1195 	}
1196 }
1197 
1198 static void htb_attach(struct Qdisc *sch)
1199 {
1200 	struct htb_sched *q = qdisc_priv(sch);
1201 
1202 	if (q->offload)
1203 		htb_attach_offload(sch);
1204 	else
1205 		htb_attach_software(sch);
1206 }
1207 
1208 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1209 {
1210 	struct htb_sched *q = qdisc_priv(sch);
1211 	struct nlattr *nest;
1212 	struct tc_htb_glob gopt;
1213 
1214 	if (q->offload)
1215 		sch->flags |= TCQ_F_OFFLOADED;
1216 	else
1217 		sch->flags &= ~TCQ_F_OFFLOADED;
1218 
1219 	sch->qstats.overlimits = q->overlimits;
1220 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1221 	 * no change can happen on the qdisc parameters.
1222 	 */
1223 
1224 	gopt.direct_pkts = q->direct_pkts;
1225 	gopt.version = HTB_VER;
1226 	gopt.rate2quantum = q->rate2quantum;
1227 	gopt.defcls = q->defcls;
1228 	gopt.debug = 0;
1229 
1230 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1231 	if (nest == NULL)
1232 		goto nla_put_failure;
1233 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1234 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1235 		goto nla_put_failure;
1236 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1237 		goto nla_put_failure;
1238 
1239 	return nla_nest_end(skb, nest);
1240 
1241 nla_put_failure:
1242 	nla_nest_cancel(skb, nest);
1243 	return -1;
1244 }
1245 
1246 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1247 			  struct sk_buff *skb, struct tcmsg *tcm)
1248 {
1249 	struct htb_class *cl = (struct htb_class *)arg;
1250 	struct htb_sched *q = qdisc_priv(sch);
1251 	struct nlattr *nest;
1252 	struct tc_htb_opt opt;
1253 
1254 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1255 	 * no change can happen on the class parameters.
1256 	 */
1257 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1258 	tcm->tcm_handle = cl->common.classid;
1259 	if (!cl->level && cl->leaf.q)
1260 		tcm->tcm_info = cl->leaf.q->handle;
1261 
1262 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1263 	if (nest == NULL)
1264 		goto nla_put_failure;
1265 
1266 	memset(&opt, 0, sizeof(opt));
1267 
1268 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1269 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1270 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1271 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1272 	opt.quantum = cl->quantum;
1273 	opt.prio = cl->prio;
1274 	opt.level = cl->level;
1275 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1276 		goto nla_put_failure;
1277 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1278 		goto nla_put_failure;
1279 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1280 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1281 			      TCA_HTB_PAD))
1282 		goto nla_put_failure;
1283 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1284 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1285 			      TCA_HTB_PAD))
1286 		goto nla_put_failure;
1287 
1288 	return nla_nest_end(skb, nest);
1289 
1290 nla_put_failure:
1291 	nla_nest_cancel(skb, nest);
1292 	return -1;
1293 }
1294 
1295 static void htb_offload_aggregate_stats(struct htb_sched *q,
1296 					struct htb_class *cl)
1297 {
1298 	u64 bytes = 0, packets = 0;
1299 	struct htb_class *c;
1300 	unsigned int i;
1301 
1302 	gnet_stats_basic_sync_init(&cl->bstats);
1303 
1304 	for (i = 0; i < q->clhash.hashsize; i++) {
1305 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1306 			struct htb_class *p = c;
1307 
1308 			while (p && p->level < cl->level)
1309 				p = p->parent;
1310 
1311 			if (p != cl)
1312 				continue;
1313 
1314 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1315 			packets += u64_stats_read(&c->bstats_bias.packets);
1316 			if (c->level == 0) {
1317 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1318 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1319 			}
1320 		}
1321 	}
1322 	_bstats_update(&cl->bstats, bytes, packets);
1323 }
1324 
1325 static int
1326 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1327 {
1328 	struct htb_class *cl = (struct htb_class *)arg;
1329 	struct htb_sched *q = qdisc_priv(sch);
1330 	struct gnet_stats_queue qs = {
1331 		.drops = cl->drops,
1332 		.overlimits = cl->overlimits,
1333 	};
1334 	__u32 qlen = 0;
1335 
1336 	if (!cl->level && cl->leaf.q)
1337 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1338 
1339 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1340 				    INT_MIN, INT_MAX);
1341 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1342 				     INT_MIN, INT_MAX);
1343 
1344 	if (q->offload) {
1345 		if (!cl->level) {
1346 			if (cl->leaf.q)
1347 				cl->bstats = cl->leaf.q->bstats;
1348 			else
1349 				gnet_stats_basic_sync_init(&cl->bstats);
1350 			_bstats_update(&cl->bstats,
1351 				       u64_stats_read(&cl->bstats_bias.bytes),
1352 				       u64_stats_read(&cl->bstats_bias.packets));
1353 		} else {
1354 			htb_offload_aggregate_stats(q, cl);
1355 		}
1356 	}
1357 
1358 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1359 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1360 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1361 		return -1;
1362 
1363 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1364 }
1365 
1366 static struct netdev_queue *
1367 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1368 {
1369 	struct net_device *dev = qdisc_dev(sch);
1370 	struct tc_htb_qopt_offload offload_opt;
1371 	struct htb_sched *q = qdisc_priv(sch);
1372 	int err;
1373 
1374 	if (!q->offload)
1375 		return sch->dev_queue;
1376 
1377 	offload_opt = (struct tc_htb_qopt_offload) {
1378 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1379 		.classid = TC_H_MIN(tcm->tcm_parent),
1380 	};
1381 	err = htb_offload(dev, &offload_opt);
1382 	if (err || offload_opt.qid >= dev->num_tx_queues)
1383 		return NULL;
1384 	return netdev_get_tx_queue(dev, offload_opt.qid);
1385 }
1386 
1387 static struct Qdisc *
1388 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1389 {
1390 	struct net_device *dev = dev_queue->dev;
1391 	struct Qdisc *old_q;
1392 
1393 	if (dev->flags & IFF_UP)
1394 		dev_deactivate(dev);
1395 	old_q = dev_graft_qdisc(dev_queue, new_q);
1396 	if (new_q)
1397 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1398 	if (dev->flags & IFF_UP)
1399 		dev_activate(dev);
1400 
1401 	return old_q;
1402 }
1403 
1404 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1405 {
1406 	struct netdev_queue *queue;
1407 
1408 	queue = cl->leaf.offload_queue;
1409 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1410 		WARN_ON(cl->leaf.q->dev_queue != queue);
1411 
1412 	return queue;
1413 }
1414 
1415 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1416 				   struct htb_class *cl_new, bool destroying)
1417 {
1418 	struct netdev_queue *queue_old, *queue_new;
1419 	struct net_device *dev = qdisc_dev(sch);
1420 
1421 	queue_old = htb_offload_get_queue(cl_old);
1422 	queue_new = htb_offload_get_queue(cl_new);
1423 
1424 	if (!destroying) {
1425 		struct Qdisc *qdisc;
1426 
1427 		if (dev->flags & IFF_UP)
1428 			dev_deactivate(dev);
1429 		qdisc = dev_graft_qdisc(queue_old, NULL);
1430 		WARN_ON(qdisc != cl_old->leaf.q);
1431 	}
1432 
1433 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1434 		cl_old->leaf.q->dev_queue = queue_new;
1435 	cl_old->leaf.offload_queue = queue_new;
1436 
1437 	if (!destroying) {
1438 		struct Qdisc *qdisc;
1439 
1440 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1441 		if (dev->flags & IFF_UP)
1442 			dev_activate(dev);
1443 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1444 	}
1445 }
1446 
1447 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1448 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1449 {
1450 	struct netdev_queue *dev_queue = sch->dev_queue;
1451 	struct htb_class *cl = (struct htb_class *)arg;
1452 	struct htb_sched *q = qdisc_priv(sch);
1453 	struct Qdisc *old_q;
1454 
1455 	if (cl->level)
1456 		return -EINVAL;
1457 
1458 	if (q->offload)
1459 		dev_queue = htb_offload_get_queue(cl);
1460 
1461 	if (!new) {
1462 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1463 					cl->common.classid, extack);
1464 		if (!new)
1465 			return -ENOBUFS;
1466 	}
1467 
1468 	if (q->offload) {
1469 		htb_set_lockdep_class_child(new);
1470 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1471 		qdisc_refcount_inc(new);
1472 		old_q = htb_graft_helper(dev_queue, new);
1473 	}
1474 
1475 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1476 
1477 	if (q->offload) {
1478 		WARN_ON(old_q != *old);
1479 		qdisc_put(old_q);
1480 	}
1481 
1482 	return 0;
1483 }
1484 
1485 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1486 {
1487 	struct htb_class *cl = (struct htb_class *)arg;
1488 	return !cl->level ? cl->leaf.q : NULL;
1489 }
1490 
1491 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1492 {
1493 	struct htb_class *cl = (struct htb_class *)arg;
1494 
1495 	htb_deactivate(qdisc_priv(sch), cl);
1496 }
1497 
1498 static inline int htb_parent_last_child(struct htb_class *cl)
1499 {
1500 	if (!cl->parent)
1501 		/* the root class */
1502 		return 0;
1503 	if (cl->parent->children > 1)
1504 		/* not the last child */
1505 		return 0;
1506 	return 1;
1507 }
1508 
1509 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1510 			       struct Qdisc *new_q)
1511 {
1512 	struct htb_sched *q = qdisc_priv(sch);
1513 	struct htb_class *parent = cl->parent;
1514 
1515 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1516 
1517 	if (parent->cmode != HTB_CAN_SEND)
1518 		htb_safe_rb_erase(&parent->pq_node,
1519 				  &q->hlevel[parent->level].wait_pq);
1520 
1521 	parent->level = 0;
1522 	memset(&parent->inner, 0, sizeof(parent->inner));
1523 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1524 	parent->tokens = parent->buffer;
1525 	parent->ctokens = parent->cbuffer;
1526 	parent->t_c = ktime_get_ns();
1527 	parent->cmode = HTB_CAN_SEND;
1528 	if (q->offload)
1529 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1530 }
1531 
1532 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1533 				       struct netdev_queue *dev_queue,
1534 				       struct Qdisc *new_q)
1535 {
1536 	struct Qdisc *old_q;
1537 
1538 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1539 	if (new_q)
1540 		qdisc_refcount_inc(new_q);
1541 	old_q = htb_graft_helper(dev_queue, new_q);
1542 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1543 }
1544 
1545 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1546 				     bool last_child, bool destroying,
1547 				     struct netlink_ext_ack *extack)
1548 {
1549 	struct tc_htb_qopt_offload offload_opt;
1550 	struct netdev_queue *dev_queue;
1551 	struct Qdisc *q = cl->leaf.q;
1552 	struct Qdisc *old;
1553 	int err;
1554 
1555 	if (cl->level)
1556 		return -EINVAL;
1557 
1558 	WARN_ON(!q);
1559 	dev_queue = htb_offload_get_queue(cl);
1560 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1561 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1562 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1563 	 */
1564 	if (!destroying) {
1565 		old = htb_graft_helper(dev_queue, NULL);
1566 		/* Last qdisc grafted should be the same as cl->leaf.q when
1567 		 * calling htb_delete.
1568 		 */
1569 		WARN_ON(old != q);
1570 	}
1571 
1572 	if (cl->parent) {
1573 		_bstats_update(&cl->parent->bstats_bias,
1574 			       u64_stats_read(&q->bstats.bytes),
1575 			       u64_stats_read(&q->bstats.packets));
1576 	}
1577 
1578 	offload_opt = (struct tc_htb_qopt_offload) {
1579 		.command = !last_child ? TC_HTB_LEAF_DEL :
1580 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1581 			   TC_HTB_LEAF_DEL_LAST,
1582 		.classid = cl->common.classid,
1583 		.extack = extack,
1584 	};
1585 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1586 
1587 	if (!destroying) {
1588 		if (!err)
1589 			qdisc_put(old);
1590 		else
1591 			htb_graft_helper(dev_queue, old);
1592 	}
1593 
1594 	if (last_child)
1595 		return err;
1596 
1597 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1598 		u32 classid = TC_H_MAJ(sch->handle) |
1599 			      TC_H_MIN(offload_opt.classid);
1600 		struct htb_class *moved_cl = htb_find(classid, sch);
1601 
1602 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1603 	}
1604 
1605 	return err;
1606 }
1607 
1608 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1609 {
1610 	if (!cl->level) {
1611 		WARN_ON(!cl->leaf.q);
1612 		qdisc_put(cl->leaf.q);
1613 	}
1614 	gen_kill_estimator(&cl->rate_est);
1615 	tcf_block_put(cl->block);
1616 	kfree(cl);
1617 }
1618 
1619 static void htb_destroy(struct Qdisc *sch)
1620 {
1621 	struct net_device *dev = qdisc_dev(sch);
1622 	struct tc_htb_qopt_offload offload_opt;
1623 	struct htb_sched *q = qdisc_priv(sch);
1624 	struct hlist_node *next;
1625 	bool nonempty, changed;
1626 	struct htb_class *cl;
1627 	unsigned int i;
1628 
1629 	cancel_work_sync(&q->work);
1630 	qdisc_watchdog_cancel(&q->watchdog);
1631 	/* This line used to be after htb_destroy_class call below
1632 	 * and surprisingly it worked in 2.4. But it must precede it
1633 	 * because filter need its target class alive to be able to call
1634 	 * unbind_filter on it (without Oops).
1635 	 */
1636 	tcf_block_put(q->block);
1637 
1638 	for (i = 0; i < q->clhash.hashsize; i++) {
1639 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1640 			tcf_block_put(cl->block);
1641 			cl->block = NULL;
1642 		}
1643 	}
1644 
1645 	do {
1646 		nonempty = false;
1647 		changed = false;
1648 		for (i = 0; i < q->clhash.hashsize; i++) {
1649 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1650 						  common.hnode) {
1651 				bool last_child;
1652 
1653 				if (!q->offload) {
1654 					htb_destroy_class(sch, cl);
1655 					continue;
1656 				}
1657 
1658 				nonempty = true;
1659 
1660 				if (cl->level)
1661 					continue;
1662 
1663 				changed = true;
1664 
1665 				last_child = htb_parent_last_child(cl);
1666 				htb_destroy_class_offload(sch, cl, last_child,
1667 							  true, NULL);
1668 				qdisc_class_hash_remove(&q->clhash,
1669 							&cl->common);
1670 				if (cl->parent)
1671 					cl->parent->children--;
1672 				if (last_child)
1673 					htb_parent_to_leaf(sch, cl, NULL);
1674 				htb_destroy_class(sch, cl);
1675 			}
1676 		}
1677 	} while (changed);
1678 	WARN_ON(nonempty);
1679 
1680 	qdisc_class_hash_destroy(&q->clhash);
1681 	__qdisc_reset_queue(&q->direct_queue);
1682 
1683 	if (q->offload) {
1684 		offload_opt = (struct tc_htb_qopt_offload) {
1685 			.command = TC_HTB_DESTROY,
1686 		};
1687 		htb_offload(dev, &offload_opt);
1688 	}
1689 
1690 	if (!q->direct_qdiscs)
1691 		return;
1692 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1693 		qdisc_put(q->direct_qdiscs[i]);
1694 	kfree(q->direct_qdiscs);
1695 }
1696 
1697 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1698 		      struct netlink_ext_ack *extack)
1699 {
1700 	struct htb_sched *q = qdisc_priv(sch);
1701 	struct htb_class *cl = (struct htb_class *)arg;
1702 	struct Qdisc *new_q = NULL;
1703 	int last_child = 0;
1704 	int err;
1705 
1706 	/* TODO: why don't allow to delete subtree ? references ? does
1707 	 * tc subsys guarantee us that in htb_destroy it holds no class
1708 	 * refs so that we can remove children safely there ?
1709 	 */
1710 	if (cl->children || cl->filter_cnt)
1711 		return -EBUSY;
1712 
1713 	if (!cl->level && htb_parent_last_child(cl))
1714 		last_child = 1;
1715 
1716 	if (q->offload) {
1717 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1718 						extack);
1719 		if (err)
1720 			return err;
1721 	}
1722 
1723 	if (last_child) {
1724 		struct netdev_queue *dev_queue = sch->dev_queue;
1725 
1726 		if (q->offload)
1727 			dev_queue = htb_offload_get_queue(cl);
1728 
1729 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1730 					  cl->parent->common.classid,
1731 					  NULL);
1732 		if (q->offload) {
1733 			if (new_q)
1734 				htb_set_lockdep_class_child(new_q);
1735 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1736 		}
1737 	}
1738 
1739 	sch_tree_lock(sch);
1740 
1741 	if (!cl->level)
1742 		qdisc_purge_queue(cl->leaf.q);
1743 
1744 	/* delete from hash and active; remainder in destroy_class */
1745 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1746 	if (cl->parent)
1747 		cl->parent->children--;
1748 
1749 	if (cl->prio_activity)
1750 		htb_deactivate(q, cl);
1751 
1752 	if (cl->cmode != HTB_CAN_SEND)
1753 		htb_safe_rb_erase(&cl->pq_node,
1754 				  &q->hlevel[cl->level].wait_pq);
1755 
1756 	if (last_child)
1757 		htb_parent_to_leaf(sch, cl, new_q);
1758 
1759 	sch_tree_unlock(sch);
1760 
1761 	htb_destroy_class(sch, cl);
1762 	return 0;
1763 }
1764 
1765 static int htb_change_class(struct Qdisc *sch, u32 classid,
1766 			    u32 parentid, struct nlattr **tca,
1767 			    unsigned long *arg, struct netlink_ext_ack *extack)
1768 {
1769 	int err = -EINVAL;
1770 	struct htb_sched *q = qdisc_priv(sch);
1771 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1772 	struct tc_htb_qopt_offload offload_opt;
1773 	struct nlattr *opt = tca[TCA_OPTIONS];
1774 	struct nlattr *tb[TCA_HTB_MAX + 1];
1775 	struct Qdisc *parent_qdisc = NULL;
1776 	struct netdev_queue *dev_queue;
1777 	struct tc_htb_opt *hopt;
1778 	u64 rate64, ceil64;
1779 	int warn = 0;
1780 
1781 	/* extract all subattrs from opt attr */
1782 	if (!opt)
1783 		goto failure;
1784 
1785 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1786 					  NULL);
1787 	if (err < 0)
1788 		goto failure;
1789 
1790 	err = -EINVAL;
1791 	if (tb[TCA_HTB_PARMS] == NULL)
1792 		goto failure;
1793 
1794 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1795 
1796 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1797 	if (!hopt->rate.rate || !hopt->ceil.rate)
1798 		goto failure;
1799 
1800 	if (q->offload) {
1801 		/* Options not supported by the offload. */
1802 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1803 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1804 			goto failure;
1805 		}
1806 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1807 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1808 			goto failure;
1809 		}
1810 		if (hopt->quantum) {
1811 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter");
1812 			goto failure;
1813 		}
1814 		if (hopt->prio) {
1815 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the prio parameter");
1816 			goto failure;
1817 		}
1818 	}
1819 
1820 	/* Keeping backward compatible with rate_table based iproute2 tc */
1821 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1822 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1823 					      NULL));
1824 
1825 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1826 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1827 					      NULL));
1828 
1829 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1830 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1831 
1832 	if (!cl) {		/* new class */
1833 		struct net_device *dev = qdisc_dev(sch);
1834 		struct Qdisc *new_q, *old_q;
1835 		int prio;
1836 		struct {
1837 			struct nlattr		nla;
1838 			struct gnet_estimator	opt;
1839 		} est = {
1840 			.nla = {
1841 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1842 				.nla_type	= TCA_RATE,
1843 			},
1844 			.opt = {
1845 				/* 4s interval, 16s averaging constant */
1846 				.interval	= 2,
1847 				.ewma_log	= 2,
1848 			},
1849 		};
1850 
1851 		/* check for valid classid */
1852 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1853 		    htb_find(classid, sch))
1854 			goto failure;
1855 
1856 		/* check maximal depth */
1857 		if (parent && parent->parent && parent->parent->level < 2) {
1858 			pr_err("htb: tree is too deep\n");
1859 			goto failure;
1860 		}
1861 		err = -ENOBUFS;
1862 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1863 		if (!cl)
1864 			goto failure;
1865 
1866 		gnet_stats_basic_sync_init(&cl->bstats);
1867 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1868 
1869 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1870 		if (err) {
1871 			kfree(cl);
1872 			goto failure;
1873 		}
1874 		if (htb_rate_est || tca[TCA_RATE]) {
1875 			err = gen_new_estimator(&cl->bstats, NULL,
1876 						&cl->rate_est,
1877 						NULL,
1878 						true,
1879 						tca[TCA_RATE] ? : &est.nla);
1880 			if (err)
1881 				goto err_block_put;
1882 		}
1883 
1884 		cl->children = 0;
1885 		RB_CLEAR_NODE(&cl->pq_node);
1886 
1887 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1888 			RB_CLEAR_NODE(&cl->node[prio]);
1889 
1890 		cl->common.classid = classid;
1891 
1892 		/* Make sure nothing interrupts us in between of two
1893 		 * ndo_setup_tc calls.
1894 		 */
1895 		ASSERT_RTNL();
1896 
1897 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1898 		 * so that can't be used inside of sch_tree_lock
1899 		 * -- thanks to Karlis Peisenieks
1900 		 */
1901 		if (!q->offload) {
1902 			dev_queue = sch->dev_queue;
1903 		} else if (!(parent && !parent->level)) {
1904 			/* Assign a dev_queue to this classid. */
1905 			offload_opt = (struct tc_htb_qopt_offload) {
1906 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1907 				.classid = cl->common.classid,
1908 				.parent_classid = parent ?
1909 					TC_H_MIN(parent->common.classid) :
1910 					TC_HTB_CLASSID_ROOT,
1911 				.rate = max_t(u64, hopt->rate.rate, rate64),
1912 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1913 				.extack = extack,
1914 			};
1915 			err = htb_offload(dev, &offload_opt);
1916 			if (err) {
1917 				pr_err("htb: TC_HTB_LEAF_ALLOC_QUEUE failed with err = %d\n",
1918 				       err);
1919 				goto err_kill_estimator;
1920 			}
1921 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1922 		} else { /* First child. */
1923 			dev_queue = htb_offload_get_queue(parent);
1924 			old_q = htb_graft_helper(dev_queue, NULL);
1925 			WARN_ON(old_q != parent->leaf.q);
1926 			offload_opt = (struct tc_htb_qopt_offload) {
1927 				.command = TC_HTB_LEAF_TO_INNER,
1928 				.classid = cl->common.classid,
1929 				.parent_classid =
1930 					TC_H_MIN(parent->common.classid),
1931 				.rate = max_t(u64, hopt->rate.rate, rate64),
1932 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1933 				.extack = extack,
1934 			};
1935 			err = htb_offload(dev, &offload_opt);
1936 			if (err) {
1937 				pr_err("htb: TC_HTB_LEAF_TO_INNER failed with err = %d\n",
1938 				       err);
1939 				htb_graft_helper(dev_queue, old_q);
1940 				goto err_kill_estimator;
1941 			}
1942 			_bstats_update(&parent->bstats_bias,
1943 				       u64_stats_read(&old_q->bstats.bytes),
1944 				       u64_stats_read(&old_q->bstats.packets));
1945 			qdisc_put(old_q);
1946 		}
1947 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1948 					  classid, NULL);
1949 		if (q->offload) {
1950 			if (new_q) {
1951 				htb_set_lockdep_class_child(new_q);
1952 				/* One ref for cl->leaf.q, the other for
1953 				 * dev_queue->qdisc.
1954 				 */
1955 				qdisc_refcount_inc(new_q);
1956 			}
1957 			old_q = htb_graft_helper(dev_queue, new_q);
1958 			/* No qdisc_put needed. */
1959 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1960 		}
1961 		sch_tree_lock(sch);
1962 		if (parent && !parent->level) {
1963 			/* turn parent into inner node */
1964 			qdisc_purge_queue(parent->leaf.q);
1965 			parent_qdisc = parent->leaf.q;
1966 			if (parent->prio_activity)
1967 				htb_deactivate(q, parent);
1968 
1969 			/* remove from evt list because of level change */
1970 			if (parent->cmode != HTB_CAN_SEND) {
1971 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1972 				parent->cmode = HTB_CAN_SEND;
1973 			}
1974 			parent->level = (parent->parent ? parent->parent->level
1975 					 : TC_HTB_MAXDEPTH) - 1;
1976 			memset(&parent->inner, 0, sizeof(parent->inner));
1977 		}
1978 
1979 		/* leaf (we) needs elementary qdisc */
1980 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1981 		if (q->offload)
1982 			cl->leaf.offload_queue = dev_queue;
1983 
1984 		cl->parent = parent;
1985 
1986 		/* set class to be in HTB_CAN_SEND state */
1987 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1988 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1989 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1990 		cl->t_c = ktime_get_ns();
1991 		cl->cmode = HTB_CAN_SEND;
1992 
1993 		/* attach to the hash list and parent's family */
1994 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1995 		if (parent)
1996 			parent->children++;
1997 		if (cl->leaf.q != &noop_qdisc)
1998 			qdisc_hash_add(cl->leaf.q, true);
1999 	} else {
2000 		if (tca[TCA_RATE]) {
2001 			err = gen_replace_estimator(&cl->bstats, NULL,
2002 						    &cl->rate_est,
2003 						    NULL,
2004 						    true,
2005 						    tca[TCA_RATE]);
2006 			if (err)
2007 				return err;
2008 		}
2009 
2010 		if (q->offload) {
2011 			struct net_device *dev = qdisc_dev(sch);
2012 
2013 			offload_opt = (struct tc_htb_qopt_offload) {
2014 				.command = TC_HTB_NODE_MODIFY,
2015 				.classid = cl->common.classid,
2016 				.rate = max_t(u64, hopt->rate.rate, rate64),
2017 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2018 				.extack = extack,
2019 			};
2020 			err = htb_offload(dev, &offload_opt);
2021 			if (err)
2022 				/* Estimator was replaced, and rollback may fail
2023 				 * as well, so we don't try to recover it, and
2024 				 * the estimator won't work property with the
2025 				 * offload anyway, because bstats are updated
2026 				 * only when the stats are queried.
2027 				 */
2028 				return err;
2029 		}
2030 
2031 		sch_tree_lock(sch);
2032 	}
2033 
2034 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2035 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2036 
2037 	/* it used to be a nasty bug here, we have to check that node
2038 	 * is really leaf before changing cl->leaf !
2039 	 */
2040 	if (!cl->level) {
2041 		u64 quantum = cl->rate.rate_bytes_ps;
2042 
2043 		do_div(quantum, q->rate2quantum);
2044 		cl->quantum = min_t(u64, quantum, INT_MAX);
2045 
2046 		if (!hopt->quantum && cl->quantum < 1000) {
2047 			warn = -1;
2048 			cl->quantum = 1000;
2049 		}
2050 		if (!hopt->quantum && cl->quantum > 200000) {
2051 			warn = 1;
2052 			cl->quantum = 200000;
2053 		}
2054 		if (hopt->quantum)
2055 			cl->quantum = hopt->quantum;
2056 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2057 			cl->prio = TC_HTB_NUMPRIO - 1;
2058 	}
2059 
2060 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2061 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2062 
2063 	sch_tree_unlock(sch);
2064 	qdisc_put(parent_qdisc);
2065 
2066 	if (warn)
2067 		pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
2068 			    cl->common.classid, (warn == -1 ? "small" : "big"));
2069 
2070 	qdisc_class_hash_grow(sch, &q->clhash);
2071 
2072 	*arg = (unsigned long)cl;
2073 	return 0;
2074 
2075 err_kill_estimator:
2076 	gen_kill_estimator(&cl->rate_est);
2077 err_block_put:
2078 	tcf_block_put(cl->block);
2079 	kfree(cl);
2080 failure:
2081 	return err;
2082 }
2083 
2084 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2085 				       struct netlink_ext_ack *extack)
2086 {
2087 	struct htb_sched *q = qdisc_priv(sch);
2088 	struct htb_class *cl = (struct htb_class *)arg;
2089 
2090 	return cl ? cl->block : q->block;
2091 }
2092 
2093 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2094 				     u32 classid)
2095 {
2096 	struct htb_class *cl = htb_find(classid, sch);
2097 
2098 	/*if (cl && !cl->level) return 0;
2099 	 * The line above used to be there to prevent attaching filters to
2100 	 * leaves. But at least tc_index filter uses this just to get class
2101 	 * for other reasons so that we have to allow for it.
2102 	 * ----
2103 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2104 	 * another way to "lock" the class - unlike "get" this lock can
2105 	 * be broken by class during destroy IIUC.
2106 	 */
2107 	if (cl)
2108 		cl->filter_cnt++;
2109 	return (unsigned long)cl;
2110 }
2111 
2112 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2113 {
2114 	struct htb_class *cl = (struct htb_class *)arg;
2115 
2116 	if (cl)
2117 		cl->filter_cnt--;
2118 }
2119 
2120 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2121 {
2122 	struct htb_sched *q = qdisc_priv(sch);
2123 	struct htb_class *cl;
2124 	unsigned int i;
2125 
2126 	if (arg->stop)
2127 		return;
2128 
2129 	for (i = 0; i < q->clhash.hashsize; i++) {
2130 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2131 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2132 				return;
2133 		}
2134 	}
2135 }
2136 
2137 static const struct Qdisc_class_ops htb_class_ops = {
2138 	.select_queue	=	htb_select_queue,
2139 	.graft		=	htb_graft,
2140 	.leaf		=	htb_leaf,
2141 	.qlen_notify	=	htb_qlen_notify,
2142 	.find		=	htb_search,
2143 	.change		=	htb_change_class,
2144 	.delete		=	htb_delete,
2145 	.walk		=	htb_walk,
2146 	.tcf_block	=	htb_tcf_block,
2147 	.bind_tcf	=	htb_bind_filter,
2148 	.unbind_tcf	=	htb_unbind_filter,
2149 	.dump		=	htb_dump_class,
2150 	.dump_stats	=	htb_dump_class_stats,
2151 };
2152 
2153 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2154 	.cl_ops		=	&htb_class_ops,
2155 	.id		=	"htb",
2156 	.priv_size	=	sizeof(struct htb_sched),
2157 	.enqueue	=	htb_enqueue,
2158 	.dequeue	=	htb_dequeue,
2159 	.peek		=	qdisc_peek_dequeued,
2160 	.init		=	htb_init,
2161 	.attach		=	htb_attach,
2162 	.reset		=	htb_reset,
2163 	.destroy	=	htb_destroy,
2164 	.dump		=	htb_dump,
2165 	.owner		=	THIS_MODULE,
2166 };
2167 
2168 static int __init htb_module_init(void)
2169 {
2170 	return register_qdisc(&htb_qdisc_ops);
2171 }
2172 static void __exit htb_module_exit(void)
2173 {
2174 	unregister_qdisc(&htb_qdisc_ops);
2175 }
2176 
2177 module_init(htb_module_init)
2178 module_exit(htb_module_exit)
2179 MODULE_LICENSE("GPL");
2180