xref: /openbmc/linux/net/sched/sch_htb.c (revision f87deada)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 #include <net/pkt_cls.h>
44 
45 /* HTB algorithm.
46     Author: devik@cdi.cz
47     ========================================================================
48     HTB is like TBF with multiple classes. It is also similar to CBQ because
49     it allows to assign priority to each class in hierarchy.
50     In fact it is another implementation of Floyd's formal sharing.
51 
52     Levels:
53     Each class is assigned level. Leaf has ALWAYS level 0 and root
54     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
55     one less than their parent.
56 */
57 
58 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
59 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
60 
61 #if HTB_VER >> 16 != TC_HTB_PROTOVER
62 #error "Mismatched sch_htb.c and pkt_sch.h"
63 #endif
64 
65 /* Module parameter and sysfs export */
66 module_param    (htb_hysteresis, int, 0640);
67 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
68 
69 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
70 module_param(htb_rate_est, int, 0640);
71 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
72 
73 /* used internaly to keep status of single class */
74 enum htb_cmode {
75 	HTB_CANT_SEND,		/* class can't send and can't borrow */
76 	HTB_MAY_BORROW,		/* class can't send but may borrow */
77 	HTB_CAN_SEND		/* class can send */
78 };
79 
80 struct htb_prio {
81 	union {
82 		struct rb_root	row;
83 		struct rb_root	feed;
84 	};
85 	struct rb_node	*ptr;
86 	/* When class changes from state 1->2 and disconnects from
87 	 * parent's feed then we lost ptr value and start from the
88 	 * first child again. Here we store classid of the
89 	 * last valid ptr (used when ptr is NULL).
90 	 */
91 	u32		last_ptr_id;
92 };
93 
94 /* interior & leaf nodes; props specific to leaves are marked L:
95  * To reduce false sharing, place mostly read fields at beginning,
96  * and mostly written ones at the end.
97  */
98 struct htb_class {
99 	struct Qdisc_class_common common;
100 	struct psched_ratecfg	rate;
101 	struct psched_ratecfg	ceil;
102 	s64			buffer, cbuffer;/* token bucket depth/rate */
103 	s64			mbuffer;	/* max wait time */
104 	u32			prio;		/* these two are used only by leaves... */
105 	int			quantum;	/* but stored for parent-to-leaf return */
106 
107 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
108 	struct tcf_block	*block;
109 	int			filter_cnt;
110 
111 	int			level;		/* our level (see above) */
112 	unsigned int		children;
113 	struct htb_class	*parent;	/* parent class */
114 
115 	struct net_rate_estimator __rcu *rate_est;
116 
117 	/*
118 	 * Written often fields
119 	 */
120 	struct gnet_stats_basic_packed bstats;
121 	struct tc_htb_xstats	xstats;	/* our special stats */
122 
123 	/* token bucket parameters */
124 	s64			tokens, ctokens;/* current number of tokens */
125 	s64			t_c;		/* checkpoint time */
126 
127 	union {
128 		struct htb_class_leaf {
129 			struct list_head drop_list;
130 			int		deficit[TC_HTB_MAXDEPTH];
131 			struct Qdisc	*q;
132 		} leaf;
133 		struct htb_class_inner {
134 			struct htb_prio clprio[TC_HTB_NUMPRIO];
135 		} inner;
136 	} un;
137 	s64			pq_key;
138 
139 	int			prio_activity;	/* for which prios are we active */
140 	enum htb_cmode		cmode;		/* current mode of the class */
141 	struct rb_node		pq_node;	/* node for event queue */
142 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
143 
144 	unsigned int drops ____cacheline_aligned_in_smp;
145 	unsigned int		overlimits;
146 };
147 
148 struct htb_level {
149 	struct rb_root	wait_pq;
150 	struct htb_prio hprio[TC_HTB_NUMPRIO];
151 };
152 
153 struct htb_sched {
154 	struct Qdisc_class_hash clhash;
155 	int			defcls;		/* class where unclassified flows go to */
156 	int			rate2quantum;	/* quant = rate / rate2quantum */
157 
158 	/* filters for qdisc itself */
159 	struct tcf_proto __rcu	*filter_list;
160 	struct tcf_block	*block;
161 
162 #define HTB_WARN_TOOMANYEVENTS	0x1
163 	unsigned int		warned;	/* only one warning */
164 	int			direct_qlen;
165 	struct work_struct	work;
166 
167 	/* non shaped skbs; let them go directly thru */
168 	struct qdisc_skb_head	direct_queue;
169 	long			direct_pkts;
170 
171 	struct qdisc_watchdog	watchdog;
172 
173 	s64			now;	/* cached dequeue time */
174 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
175 
176 	/* time of nearest event per level (row) */
177 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
178 
179 	int			row_mask[TC_HTB_MAXDEPTH];
180 
181 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
182 };
183 
184 /* find class in global hash table using given handle */
185 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
186 {
187 	struct htb_sched *q = qdisc_priv(sch);
188 	struct Qdisc_class_common *clc;
189 
190 	clc = qdisc_class_find(&q->clhash, handle);
191 	if (clc == NULL)
192 		return NULL;
193 	return container_of(clc, struct htb_class, common);
194 }
195 
196 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
197 {
198 	return (unsigned long)htb_find(handle, sch);
199 }
200 /**
201  * htb_classify - classify a packet into class
202  *
203  * It returns NULL if the packet should be dropped or -1 if the packet
204  * should be passed directly thru. In all other cases leaf class is returned.
205  * We allow direct class selection by classid in priority. The we examine
206  * filters in qdisc and in inner nodes (if higher filter points to the inner
207  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
208  * internal fifo (direct). These packets then go directly thru. If we still
209  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
210  * then finish and return direct queue.
211  */
212 #define HTB_DIRECT ((struct htb_class *)-1L)
213 
214 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
215 				      int *qerr)
216 {
217 	struct htb_sched *q = qdisc_priv(sch);
218 	struct htb_class *cl;
219 	struct tcf_result res;
220 	struct tcf_proto *tcf;
221 	int result;
222 
223 	/* allow to select class by setting skb->priority to valid classid;
224 	 * note that nfmark can be used too by attaching filter fw with no
225 	 * rules in it
226 	 */
227 	if (skb->priority == sch->handle)
228 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
229 	cl = htb_find(skb->priority, sch);
230 	if (cl) {
231 		if (cl->level == 0)
232 			return cl;
233 		/* Start with inner filter chain if a non-leaf class is selected */
234 		tcf = rcu_dereference_bh(cl->filter_list);
235 	} else {
236 		tcf = rcu_dereference_bh(q->filter_list);
237 	}
238 
239 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
240 	while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
241 #ifdef CONFIG_NET_CLS_ACT
242 		switch (result) {
243 		case TC_ACT_QUEUED:
244 		case TC_ACT_STOLEN:
245 		case TC_ACT_TRAP:
246 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
247 			/* fall through */
248 		case TC_ACT_SHOT:
249 			return NULL;
250 		}
251 #endif
252 		cl = (void *)res.class;
253 		if (!cl) {
254 			if (res.classid == sch->handle)
255 				return HTB_DIRECT;	/* X:0 (direct flow) */
256 			cl = htb_find(res.classid, sch);
257 			if (!cl)
258 				break;	/* filter selected invalid classid */
259 		}
260 		if (!cl->level)
261 			return cl;	/* we hit leaf; return it */
262 
263 		/* we have got inner class; apply inner filter chain */
264 		tcf = rcu_dereference_bh(cl->filter_list);
265 	}
266 	/* classification failed; try to use default class */
267 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
268 	if (!cl || cl->level)
269 		return HTB_DIRECT;	/* bad default .. this is safe bet */
270 	return cl;
271 }
272 
273 /**
274  * htb_add_to_id_tree - adds class to the round robin list
275  *
276  * Routine adds class to the list (actually tree) sorted by classid.
277  * Make sure that class is not already on such list for given prio.
278  */
279 static void htb_add_to_id_tree(struct rb_root *root,
280 			       struct htb_class *cl, int prio)
281 {
282 	struct rb_node **p = &root->rb_node, *parent = NULL;
283 
284 	while (*p) {
285 		struct htb_class *c;
286 		parent = *p;
287 		c = rb_entry(parent, struct htb_class, node[prio]);
288 
289 		if (cl->common.classid > c->common.classid)
290 			p = &parent->rb_right;
291 		else
292 			p = &parent->rb_left;
293 	}
294 	rb_link_node(&cl->node[prio], parent, p);
295 	rb_insert_color(&cl->node[prio], root);
296 }
297 
298 /**
299  * htb_add_to_wait_tree - adds class to the event queue with delay
300  *
301  * The class is added to priority event queue to indicate that class will
302  * change its mode in cl->pq_key microseconds. Make sure that class is not
303  * already in the queue.
304  */
305 static void htb_add_to_wait_tree(struct htb_sched *q,
306 				 struct htb_class *cl, s64 delay)
307 {
308 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
309 
310 	cl->pq_key = q->now + delay;
311 	if (cl->pq_key == q->now)
312 		cl->pq_key++;
313 
314 	/* update the nearest event cache */
315 	if (q->near_ev_cache[cl->level] > cl->pq_key)
316 		q->near_ev_cache[cl->level] = cl->pq_key;
317 
318 	while (*p) {
319 		struct htb_class *c;
320 		parent = *p;
321 		c = rb_entry(parent, struct htb_class, pq_node);
322 		if (cl->pq_key >= c->pq_key)
323 			p = &parent->rb_right;
324 		else
325 			p = &parent->rb_left;
326 	}
327 	rb_link_node(&cl->pq_node, parent, p);
328 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
329 }
330 
331 /**
332  * htb_next_rb_node - finds next node in binary tree
333  *
334  * When we are past last key we return NULL.
335  * Average complexity is 2 steps per call.
336  */
337 static inline void htb_next_rb_node(struct rb_node **n)
338 {
339 	*n = rb_next(*n);
340 }
341 
342 /**
343  * htb_add_class_to_row - add class to its row
344  *
345  * The class is added to row at priorities marked in mask.
346  * It does nothing if mask == 0.
347  */
348 static inline void htb_add_class_to_row(struct htb_sched *q,
349 					struct htb_class *cl, int mask)
350 {
351 	q->row_mask[cl->level] |= mask;
352 	while (mask) {
353 		int prio = ffz(~mask);
354 		mask &= ~(1 << prio);
355 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
356 	}
357 }
358 
359 /* If this triggers, it is a bug in this code, but it need not be fatal */
360 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
361 {
362 	if (RB_EMPTY_NODE(rb)) {
363 		WARN_ON(1);
364 	} else {
365 		rb_erase(rb, root);
366 		RB_CLEAR_NODE(rb);
367 	}
368 }
369 
370 
371 /**
372  * htb_remove_class_from_row - removes class from its row
373  *
374  * The class is removed from row at priorities marked in mask.
375  * It does nothing if mask == 0.
376  */
377 static inline void htb_remove_class_from_row(struct htb_sched *q,
378 						 struct htb_class *cl, int mask)
379 {
380 	int m = 0;
381 	struct htb_level *hlevel = &q->hlevel[cl->level];
382 
383 	while (mask) {
384 		int prio = ffz(~mask);
385 		struct htb_prio *hprio = &hlevel->hprio[prio];
386 
387 		mask &= ~(1 << prio);
388 		if (hprio->ptr == cl->node + prio)
389 			htb_next_rb_node(&hprio->ptr);
390 
391 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
392 		if (!hprio->row.rb_node)
393 			m |= 1 << prio;
394 	}
395 	q->row_mask[cl->level] &= ~m;
396 }
397 
398 /**
399  * htb_activate_prios - creates active classe's feed chain
400  *
401  * The class is connected to ancestors and/or appropriate rows
402  * for priorities it is participating on. cl->cmode must be new
403  * (activated) mode. It does nothing if cl->prio_activity == 0.
404  */
405 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
406 {
407 	struct htb_class *p = cl->parent;
408 	long m, mask = cl->prio_activity;
409 
410 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
411 		m = mask;
412 		while (m) {
413 			int prio = ffz(~m);
414 			m &= ~(1 << prio);
415 
416 			if (p->un.inner.clprio[prio].feed.rb_node)
417 				/* parent already has its feed in use so that
418 				 * reset bit in mask as parent is already ok
419 				 */
420 				mask &= ~(1 << prio);
421 
422 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
423 		}
424 		p->prio_activity |= mask;
425 		cl = p;
426 		p = cl->parent;
427 
428 	}
429 	if (cl->cmode == HTB_CAN_SEND && mask)
430 		htb_add_class_to_row(q, cl, mask);
431 }
432 
433 /**
434  * htb_deactivate_prios - remove class from feed chain
435  *
436  * cl->cmode must represent old mode (before deactivation). It does
437  * nothing if cl->prio_activity == 0. Class is removed from all feed
438  * chains and rows.
439  */
440 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
441 {
442 	struct htb_class *p = cl->parent;
443 	long m, mask = cl->prio_activity;
444 
445 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
446 		m = mask;
447 		mask = 0;
448 		while (m) {
449 			int prio = ffz(~m);
450 			m &= ~(1 << prio);
451 
452 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
453 				/* we are removing child which is pointed to from
454 				 * parent feed - forget the pointer but remember
455 				 * classid
456 				 */
457 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
458 				p->un.inner.clprio[prio].ptr = NULL;
459 			}
460 
461 			htb_safe_rb_erase(cl->node + prio,
462 					  &p->un.inner.clprio[prio].feed);
463 
464 			if (!p->un.inner.clprio[prio].feed.rb_node)
465 				mask |= 1 << prio;
466 		}
467 
468 		p->prio_activity &= ~mask;
469 		cl = p;
470 		p = cl->parent;
471 
472 	}
473 	if (cl->cmode == HTB_CAN_SEND && mask)
474 		htb_remove_class_from_row(q, cl, mask);
475 }
476 
477 static inline s64 htb_lowater(const struct htb_class *cl)
478 {
479 	if (htb_hysteresis)
480 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
481 	else
482 		return 0;
483 }
484 static inline s64 htb_hiwater(const struct htb_class *cl)
485 {
486 	if (htb_hysteresis)
487 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
488 	else
489 		return 0;
490 }
491 
492 
493 /**
494  * htb_class_mode - computes and returns current class mode
495  *
496  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
497  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
498  * from now to time when cl will change its state.
499  * Also it is worth to note that class mode doesn't change simply
500  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
501  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
502  * mode transitions per time unit. The speed gain is about 1/6.
503  */
504 static inline enum htb_cmode
505 htb_class_mode(struct htb_class *cl, s64 *diff)
506 {
507 	s64 toks;
508 
509 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
510 		*diff = -toks;
511 		return HTB_CANT_SEND;
512 	}
513 
514 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
515 		return HTB_CAN_SEND;
516 
517 	*diff = -toks;
518 	return HTB_MAY_BORROW;
519 }
520 
521 /**
522  * htb_change_class_mode - changes classe's mode
523  *
524  * This should be the only way how to change classe's mode under normal
525  * cirsumstances. Routine will update feed lists linkage, change mode
526  * and add class to the wait event queue if appropriate. New mode should
527  * be different from old one and cl->pq_key has to be valid if changing
528  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
529  */
530 static void
531 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
532 {
533 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
534 
535 	if (new_mode == cl->cmode)
536 		return;
537 
538 	if (new_mode == HTB_CANT_SEND)
539 		cl->overlimits++;
540 
541 	if (cl->prio_activity) {	/* not necessary: speed optimization */
542 		if (cl->cmode != HTB_CANT_SEND)
543 			htb_deactivate_prios(q, cl);
544 		cl->cmode = new_mode;
545 		if (new_mode != HTB_CANT_SEND)
546 			htb_activate_prios(q, cl);
547 	} else
548 		cl->cmode = new_mode;
549 }
550 
551 /**
552  * htb_activate - inserts leaf cl into appropriate active feeds
553  *
554  * Routine learns (new) priority of leaf and activates feed chain
555  * for the prio. It can be called on already active leaf safely.
556  * It also adds leaf into droplist.
557  */
558 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
559 {
560 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
561 
562 	if (!cl->prio_activity) {
563 		cl->prio_activity = 1 << cl->prio;
564 		htb_activate_prios(q, cl);
565 		list_add_tail(&cl->un.leaf.drop_list,
566 			      q->drops + cl->prio);
567 	}
568 }
569 
570 /**
571  * htb_deactivate - remove leaf cl from active feeds
572  *
573  * Make sure that leaf is active. In the other words it can't be called
574  * with non-active leaf. It also removes class from the drop list.
575  */
576 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
577 {
578 	WARN_ON(!cl->prio_activity);
579 
580 	htb_deactivate_prios(q, cl);
581 	cl->prio_activity = 0;
582 	list_del_init(&cl->un.leaf.drop_list);
583 }
584 
585 static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
586 			     struct qdisc_skb_head *qh)
587 {
588 	struct sk_buff *last = qh->tail;
589 
590 	if (last) {
591 		skb->next = NULL;
592 		last->next = skb;
593 		qh->tail = skb;
594 	} else {
595 		qh->tail = skb;
596 		qh->head = skb;
597 	}
598 	qh->qlen++;
599 }
600 
601 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
602 		       struct sk_buff **to_free)
603 {
604 	int uninitialized_var(ret);
605 	struct htb_sched *q = qdisc_priv(sch);
606 	struct htb_class *cl = htb_classify(skb, sch, &ret);
607 
608 	if (cl == HTB_DIRECT) {
609 		/* enqueue to helper queue */
610 		if (q->direct_queue.qlen < q->direct_qlen) {
611 			htb_enqueue_tail(skb, sch, &q->direct_queue);
612 			q->direct_pkts++;
613 		} else {
614 			return qdisc_drop(skb, sch, to_free);
615 		}
616 #ifdef CONFIG_NET_CLS_ACT
617 	} else if (!cl) {
618 		if (ret & __NET_XMIT_BYPASS)
619 			qdisc_qstats_drop(sch);
620 		__qdisc_drop(skb, to_free);
621 		return ret;
622 #endif
623 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
624 					to_free)) != NET_XMIT_SUCCESS) {
625 		if (net_xmit_drop_count(ret)) {
626 			qdisc_qstats_drop(sch);
627 			cl->drops++;
628 		}
629 		return ret;
630 	} else {
631 		htb_activate(q, cl);
632 	}
633 
634 	qdisc_qstats_backlog_inc(sch, skb);
635 	sch->q.qlen++;
636 	return NET_XMIT_SUCCESS;
637 }
638 
639 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
640 {
641 	s64 toks = diff + cl->tokens;
642 
643 	if (toks > cl->buffer)
644 		toks = cl->buffer;
645 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
646 	if (toks <= -cl->mbuffer)
647 		toks = 1 - cl->mbuffer;
648 
649 	cl->tokens = toks;
650 }
651 
652 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
653 {
654 	s64 toks = diff + cl->ctokens;
655 
656 	if (toks > cl->cbuffer)
657 		toks = cl->cbuffer;
658 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
659 	if (toks <= -cl->mbuffer)
660 		toks = 1 - cl->mbuffer;
661 
662 	cl->ctokens = toks;
663 }
664 
665 /**
666  * htb_charge_class - charges amount "bytes" to leaf and ancestors
667  *
668  * Routine assumes that packet "bytes" long was dequeued from leaf cl
669  * borrowing from "level". It accounts bytes to ceil leaky bucket for
670  * leaf and all ancestors and to rate bucket for ancestors at levels
671  * "level" and higher. It also handles possible change of mode resulting
672  * from the update. Note that mode can also increase here (MAY_BORROW to
673  * CAN_SEND) because we can use more precise clock that event queue here.
674  * In such case we remove class from event queue first.
675  */
676 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
677 			     int level, struct sk_buff *skb)
678 {
679 	int bytes = qdisc_pkt_len(skb);
680 	enum htb_cmode old_mode;
681 	s64 diff;
682 
683 	while (cl) {
684 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
685 		if (cl->level >= level) {
686 			if (cl->level == level)
687 				cl->xstats.lends++;
688 			htb_accnt_tokens(cl, bytes, diff);
689 		} else {
690 			cl->xstats.borrows++;
691 			cl->tokens += diff;	/* we moved t_c; update tokens */
692 		}
693 		htb_accnt_ctokens(cl, bytes, diff);
694 		cl->t_c = q->now;
695 
696 		old_mode = cl->cmode;
697 		diff = 0;
698 		htb_change_class_mode(q, cl, &diff);
699 		if (old_mode != cl->cmode) {
700 			if (old_mode != HTB_CAN_SEND)
701 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
702 			if (cl->cmode != HTB_CAN_SEND)
703 				htb_add_to_wait_tree(q, cl, diff);
704 		}
705 
706 		/* update basic stats except for leaves which are already updated */
707 		if (cl->level)
708 			bstats_update(&cl->bstats, skb);
709 
710 		cl = cl->parent;
711 	}
712 }
713 
714 /**
715  * htb_do_events - make mode changes to classes at the level
716  *
717  * Scans event queue for pending events and applies them. Returns time of
718  * next pending event (0 for no event in pq, q->now for too many events).
719  * Note: Applied are events whose have cl->pq_key <= q->now.
720  */
721 static s64 htb_do_events(struct htb_sched *q, const int level,
722 			 unsigned long start)
723 {
724 	/* don't run for longer than 2 jiffies; 2 is used instead of
725 	 * 1 to simplify things when jiffy is going to be incremented
726 	 * too soon
727 	 */
728 	unsigned long stop_at = start + 2;
729 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
730 
731 	while (time_before(jiffies, stop_at)) {
732 		struct htb_class *cl;
733 		s64 diff;
734 		struct rb_node *p = rb_first(wait_pq);
735 
736 		if (!p)
737 			return 0;
738 
739 		cl = rb_entry(p, struct htb_class, pq_node);
740 		if (cl->pq_key > q->now)
741 			return cl->pq_key;
742 
743 		htb_safe_rb_erase(p, wait_pq);
744 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
745 		htb_change_class_mode(q, cl, &diff);
746 		if (cl->cmode != HTB_CAN_SEND)
747 			htb_add_to_wait_tree(q, cl, diff);
748 	}
749 
750 	/* too much load - let's continue after a break for scheduling */
751 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
752 		pr_warn("htb: too many events!\n");
753 		q->warned |= HTB_WARN_TOOMANYEVENTS;
754 	}
755 
756 	return q->now;
757 }
758 
759 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
760  * is no such one exists.
761  */
762 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
763 					      u32 id)
764 {
765 	struct rb_node *r = NULL;
766 	while (n) {
767 		struct htb_class *cl =
768 		    rb_entry(n, struct htb_class, node[prio]);
769 
770 		if (id > cl->common.classid) {
771 			n = n->rb_right;
772 		} else if (id < cl->common.classid) {
773 			r = n;
774 			n = n->rb_left;
775 		} else {
776 			return n;
777 		}
778 	}
779 	return r;
780 }
781 
782 /**
783  * htb_lookup_leaf - returns next leaf class in DRR order
784  *
785  * Find leaf where current feed pointers points to.
786  */
787 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
788 {
789 	int i;
790 	struct {
791 		struct rb_node *root;
792 		struct rb_node **pptr;
793 		u32 *pid;
794 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
795 
796 	BUG_ON(!hprio->row.rb_node);
797 	sp->root = hprio->row.rb_node;
798 	sp->pptr = &hprio->ptr;
799 	sp->pid = &hprio->last_ptr_id;
800 
801 	for (i = 0; i < 65535; i++) {
802 		if (!*sp->pptr && *sp->pid) {
803 			/* ptr was invalidated but id is valid - try to recover
804 			 * the original or next ptr
805 			 */
806 			*sp->pptr =
807 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
808 		}
809 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
810 				 * can become out of date quickly
811 				 */
812 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
813 			*sp->pptr = sp->root;
814 			while ((*sp->pptr)->rb_left)
815 				*sp->pptr = (*sp->pptr)->rb_left;
816 			if (sp > stk) {
817 				sp--;
818 				if (!*sp->pptr) {
819 					WARN_ON(1);
820 					return NULL;
821 				}
822 				htb_next_rb_node(sp->pptr);
823 			}
824 		} else {
825 			struct htb_class *cl;
826 			struct htb_prio *clp;
827 
828 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
829 			if (!cl->level)
830 				return cl;
831 			clp = &cl->un.inner.clprio[prio];
832 			(++sp)->root = clp->feed.rb_node;
833 			sp->pptr = &clp->ptr;
834 			sp->pid = &clp->last_ptr_id;
835 		}
836 	}
837 	WARN_ON(1);
838 	return NULL;
839 }
840 
841 /* dequeues packet at given priority and level; call only if
842  * you are sure that there is active class at prio/level
843  */
844 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
845 					const int level)
846 {
847 	struct sk_buff *skb = NULL;
848 	struct htb_class *cl, *start;
849 	struct htb_level *hlevel = &q->hlevel[level];
850 	struct htb_prio *hprio = &hlevel->hprio[prio];
851 
852 	/* look initial class up in the row */
853 	start = cl = htb_lookup_leaf(hprio, prio);
854 
855 	do {
856 next:
857 		if (unlikely(!cl))
858 			return NULL;
859 
860 		/* class can be empty - it is unlikely but can be true if leaf
861 		 * qdisc drops packets in enqueue routine or if someone used
862 		 * graft operation on the leaf since last dequeue;
863 		 * simply deactivate and skip such class
864 		 */
865 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
866 			struct htb_class *next;
867 			htb_deactivate(q, cl);
868 
869 			/* row/level might become empty */
870 			if ((q->row_mask[level] & (1 << prio)) == 0)
871 				return NULL;
872 
873 			next = htb_lookup_leaf(hprio, prio);
874 
875 			if (cl == start)	/* fix start if we just deleted it */
876 				start = next;
877 			cl = next;
878 			goto next;
879 		}
880 
881 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
882 		if (likely(skb != NULL))
883 			break;
884 
885 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
886 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
887 					 &q->hlevel[0].hprio[prio].ptr);
888 		cl = htb_lookup_leaf(hprio, prio);
889 
890 	} while (cl != start);
891 
892 	if (likely(skb != NULL)) {
893 		bstats_update(&cl->bstats, skb);
894 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
895 		if (cl->un.leaf.deficit[level] < 0) {
896 			cl->un.leaf.deficit[level] += cl->quantum;
897 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
898 						 &q->hlevel[0].hprio[prio].ptr);
899 		}
900 		/* this used to be after charge_class but this constelation
901 		 * gives us slightly better performance
902 		 */
903 		if (!cl->un.leaf.q->q.qlen)
904 			htb_deactivate(q, cl);
905 		htb_charge_class(q, cl, level, skb);
906 	}
907 	return skb;
908 }
909 
910 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
911 {
912 	struct sk_buff *skb;
913 	struct htb_sched *q = qdisc_priv(sch);
914 	int level;
915 	s64 next_event;
916 	unsigned long start_at;
917 
918 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
919 	skb = __qdisc_dequeue_head(&q->direct_queue);
920 	if (skb != NULL) {
921 ok:
922 		qdisc_bstats_update(sch, skb);
923 		qdisc_qstats_backlog_dec(sch, skb);
924 		sch->q.qlen--;
925 		return skb;
926 	}
927 
928 	if (!sch->q.qlen)
929 		goto fin;
930 	q->now = ktime_get_ns();
931 	start_at = jiffies;
932 
933 	next_event = q->now + 5LLU * NSEC_PER_SEC;
934 
935 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
936 		/* common case optimization - skip event handler quickly */
937 		int m;
938 		s64 event = q->near_ev_cache[level];
939 
940 		if (q->now >= event) {
941 			event = htb_do_events(q, level, start_at);
942 			if (!event)
943 				event = q->now + NSEC_PER_SEC;
944 			q->near_ev_cache[level] = event;
945 		}
946 
947 		if (next_event > event)
948 			next_event = event;
949 
950 		m = ~q->row_mask[level];
951 		while (m != (int)(-1)) {
952 			int prio = ffz(m);
953 
954 			m |= 1 << prio;
955 			skb = htb_dequeue_tree(q, prio, level);
956 			if (likely(skb != NULL))
957 				goto ok;
958 		}
959 	}
960 	qdisc_qstats_overlimit(sch);
961 	if (likely(next_event > q->now))
962 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
963 	else
964 		schedule_work(&q->work);
965 fin:
966 	return skb;
967 }
968 
969 /* reset all classes */
970 /* always caled under BH & queue lock */
971 static void htb_reset(struct Qdisc *sch)
972 {
973 	struct htb_sched *q = qdisc_priv(sch);
974 	struct htb_class *cl;
975 	unsigned int i;
976 
977 	for (i = 0; i < q->clhash.hashsize; i++) {
978 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
979 			if (cl->level)
980 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
981 			else {
982 				if (cl->un.leaf.q)
983 					qdisc_reset(cl->un.leaf.q);
984 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
985 			}
986 			cl->prio_activity = 0;
987 			cl->cmode = HTB_CAN_SEND;
988 		}
989 	}
990 	qdisc_watchdog_cancel(&q->watchdog);
991 	__qdisc_reset_queue(&q->direct_queue);
992 	sch->q.qlen = 0;
993 	sch->qstats.backlog = 0;
994 	memset(q->hlevel, 0, sizeof(q->hlevel));
995 	memset(q->row_mask, 0, sizeof(q->row_mask));
996 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
997 		INIT_LIST_HEAD(q->drops + i);
998 }
999 
1000 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1001 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1002 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1003 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1004 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1005 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1006 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1007 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1008 };
1009 
1010 static void htb_work_func(struct work_struct *work)
1011 {
1012 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1013 	struct Qdisc *sch = q->watchdog.qdisc;
1014 
1015 	rcu_read_lock();
1016 	__netif_schedule(qdisc_root(sch));
1017 	rcu_read_unlock();
1018 }
1019 
1020 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1021 		    struct netlink_ext_ack *extack)
1022 {
1023 	struct htb_sched *q = qdisc_priv(sch);
1024 	struct nlattr *tb[TCA_HTB_MAX + 1];
1025 	struct tc_htb_glob *gopt;
1026 	int err;
1027 	int i;
1028 
1029 	qdisc_watchdog_init(&q->watchdog, sch);
1030 	INIT_WORK(&q->work, htb_work_func);
1031 
1032 	if (!opt)
1033 		return -EINVAL;
1034 
1035 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1036 	if (err)
1037 		return err;
1038 
1039 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1040 	if (err < 0)
1041 		return err;
1042 
1043 	if (!tb[TCA_HTB_INIT])
1044 		return -EINVAL;
1045 
1046 	gopt = nla_data(tb[TCA_HTB_INIT]);
1047 	if (gopt->version != HTB_VER >> 16)
1048 		return -EINVAL;
1049 
1050 	err = qdisc_class_hash_init(&q->clhash);
1051 	if (err < 0)
1052 		return err;
1053 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1054 		INIT_LIST_HEAD(q->drops + i);
1055 
1056 	qdisc_skb_head_init(&q->direct_queue);
1057 
1058 	if (tb[TCA_HTB_DIRECT_QLEN])
1059 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1060 	else
1061 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1062 
1063 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1064 		q->rate2quantum = 1;
1065 	q->defcls = gopt->defcls;
1066 
1067 	return 0;
1068 }
1069 
1070 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1071 {
1072 	struct htb_sched *q = qdisc_priv(sch);
1073 	struct nlattr *nest;
1074 	struct tc_htb_glob gopt;
1075 
1076 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1077 	 * no change can happen on the qdisc parameters.
1078 	 */
1079 
1080 	gopt.direct_pkts = q->direct_pkts;
1081 	gopt.version = HTB_VER;
1082 	gopt.rate2quantum = q->rate2quantum;
1083 	gopt.defcls = q->defcls;
1084 	gopt.debug = 0;
1085 
1086 	nest = nla_nest_start(skb, TCA_OPTIONS);
1087 	if (nest == NULL)
1088 		goto nla_put_failure;
1089 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1090 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1091 		goto nla_put_failure;
1092 
1093 	return nla_nest_end(skb, nest);
1094 
1095 nla_put_failure:
1096 	nla_nest_cancel(skb, nest);
1097 	return -1;
1098 }
1099 
1100 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1101 			  struct sk_buff *skb, struct tcmsg *tcm)
1102 {
1103 	struct htb_class *cl = (struct htb_class *)arg;
1104 	struct nlattr *nest;
1105 	struct tc_htb_opt opt;
1106 
1107 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1108 	 * no change can happen on the class parameters.
1109 	 */
1110 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1111 	tcm->tcm_handle = cl->common.classid;
1112 	if (!cl->level && cl->un.leaf.q)
1113 		tcm->tcm_info = cl->un.leaf.q->handle;
1114 
1115 	nest = nla_nest_start(skb, TCA_OPTIONS);
1116 	if (nest == NULL)
1117 		goto nla_put_failure;
1118 
1119 	memset(&opt, 0, sizeof(opt));
1120 
1121 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1122 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1123 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1124 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1125 	opt.quantum = cl->quantum;
1126 	opt.prio = cl->prio;
1127 	opt.level = cl->level;
1128 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1129 		goto nla_put_failure;
1130 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1131 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1132 			      TCA_HTB_PAD))
1133 		goto nla_put_failure;
1134 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1135 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1136 			      TCA_HTB_PAD))
1137 		goto nla_put_failure;
1138 
1139 	return nla_nest_end(skb, nest);
1140 
1141 nla_put_failure:
1142 	nla_nest_cancel(skb, nest);
1143 	return -1;
1144 }
1145 
1146 static int
1147 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1148 {
1149 	struct htb_class *cl = (struct htb_class *)arg;
1150 	struct gnet_stats_queue qs = {
1151 		.drops = cl->drops,
1152 		.overlimits = cl->overlimits,
1153 	};
1154 	__u32 qlen = 0;
1155 
1156 	if (!cl->level && cl->un.leaf.q) {
1157 		qlen = cl->un.leaf.q->q.qlen;
1158 		qs.backlog = cl->un.leaf.q->qstats.backlog;
1159 	}
1160 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1161 				    INT_MIN, INT_MAX);
1162 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1163 				     INT_MIN, INT_MAX);
1164 
1165 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1166 				  d, NULL, &cl->bstats) < 0 ||
1167 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1168 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1169 		return -1;
1170 
1171 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1172 }
1173 
1174 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1175 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1176 {
1177 	struct htb_class *cl = (struct htb_class *)arg;
1178 
1179 	if (cl->level)
1180 		return -EINVAL;
1181 	if (new == NULL &&
1182 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1183 				     cl->common.classid, extack)) == NULL)
1184 		return -ENOBUFS;
1185 
1186 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
1187 	return 0;
1188 }
1189 
1190 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1191 {
1192 	struct htb_class *cl = (struct htb_class *)arg;
1193 	return !cl->level ? cl->un.leaf.q : NULL;
1194 }
1195 
1196 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1197 {
1198 	struct htb_class *cl = (struct htb_class *)arg;
1199 
1200 	htb_deactivate(qdisc_priv(sch), cl);
1201 }
1202 
1203 static inline int htb_parent_last_child(struct htb_class *cl)
1204 {
1205 	if (!cl->parent)
1206 		/* the root class */
1207 		return 0;
1208 	if (cl->parent->children > 1)
1209 		/* not the last child */
1210 		return 0;
1211 	return 1;
1212 }
1213 
1214 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1215 			       struct Qdisc *new_q)
1216 {
1217 	struct htb_class *parent = cl->parent;
1218 
1219 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1220 
1221 	if (parent->cmode != HTB_CAN_SEND)
1222 		htb_safe_rb_erase(&parent->pq_node,
1223 				  &q->hlevel[parent->level].wait_pq);
1224 
1225 	parent->level = 0;
1226 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1227 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1228 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1229 	parent->tokens = parent->buffer;
1230 	parent->ctokens = parent->cbuffer;
1231 	parent->t_c = ktime_get_ns();
1232 	parent->cmode = HTB_CAN_SEND;
1233 }
1234 
1235 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1236 {
1237 	if (!cl->level) {
1238 		WARN_ON(!cl->un.leaf.q);
1239 		qdisc_destroy(cl->un.leaf.q);
1240 	}
1241 	gen_kill_estimator(&cl->rate_est);
1242 	tcf_block_put(cl->block);
1243 	kfree(cl);
1244 }
1245 
1246 static void htb_destroy(struct Qdisc *sch)
1247 {
1248 	struct htb_sched *q = qdisc_priv(sch);
1249 	struct hlist_node *next;
1250 	struct htb_class *cl;
1251 	unsigned int i;
1252 
1253 	cancel_work_sync(&q->work);
1254 	qdisc_watchdog_cancel(&q->watchdog);
1255 	/* This line used to be after htb_destroy_class call below
1256 	 * and surprisingly it worked in 2.4. But it must precede it
1257 	 * because filter need its target class alive to be able to call
1258 	 * unbind_filter on it (without Oops).
1259 	 */
1260 	tcf_block_put(q->block);
1261 
1262 	for (i = 0; i < q->clhash.hashsize; i++) {
1263 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1264 			tcf_block_put(cl->block);
1265 			cl->block = NULL;
1266 		}
1267 	}
1268 	for (i = 0; i < q->clhash.hashsize; i++) {
1269 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1270 					  common.hnode)
1271 			htb_destroy_class(sch, cl);
1272 	}
1273 	qdisc_class_hash_destroy(&q->clhash);
1274 	__qdisc_reset_queue(&q->direct_queue);
1275 }
1276 
1277 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1278 {
1279 	struct htb_sched *q = qdisc_priv(sch);
1280 	struct htb_class *cl = (struct htb_class *)arg;
1281 	struct Qdisc *new_q = NULL;
1282 	int last_child = 0;
1283 
1284 	/* TODO: why don't allow to delete subtree ? references ? does
1285 	 * tc subsys guarantee us that in htb_destroy it holds no class
1286 	 * refs so that we can remove children safely there ?
1287 	 */
1288 	if (cl->children || cl->filter_cnt)
1289 		return -EBUSY;
1290 
1291 	if (!cl->level && htb_parent_last_child(cl)) {
1292 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1293 					  cl->parent->common.classid,
1294 					  NULL);
1295 		last_child = 1;
1296 	}
1297 
1298 	sch_tree_lock(sch);
1299 
1300 	if (!cl->level) {
1301 		unsigned int qlen = cl->un.leaf.q->q.qlen;
1302 		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1303 
1304 		qdisc_reset(cl->un.leaf.q);
1305 		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1306 	}
1307 
1308 	/* delete from hash and active; remainder in destroy_class */
1309 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1310 	if (cl->parent)
1311 		cl->parent->children--;
1312 
1313 	if (cl->prio_activity)
1314 		htb_deactivate(q, cl);
1315 
1316 	if (cl->cmode != HTB_CAN_SEND)
1317 		htb_safe_rb_erase(&cl->pq_node,
1318 				  &q->hlevel[cl->level].wait_pq);
1319 
1320 	if (last_child)
1321 		htb_parent_to_leaf(q, cl, new_q);
1322 
1323 	sch_tree_unlock(sch);
1324 
1325 	htb_destroy_class(sch, cl);
1326 	return 0;
1327 }
1328 
1329 static int htb_change_class(struct Qdisc *sch, u32 classid,
1330 			    u32 parentid, struct nlattr **tca,
1331 			    unsigned long *arg, struct netlink_ext_ack *extack)
1332 {
1333 	int err = -EINVAL;
1334 	struct htb_sched *q = qdisc_priv(sch);
1335 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1336 	struct nlattr *opt = tca[TCA_OPTIONS];
1337 	struct nlattr *tb[TCA_HTB_MAX + 1];
1338 	struct tc_htb_opt *hopt;
1339 	u64 rate64, ceil64;
1340 
1341 	/* extract all subattrs from opt attr */
1342 	if (!opt)
1343 		goto failure;
1344 
1345 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1346 	if (err < 0)
1347 		goto failure;
1348 
1349 	err = -EINVAL;
1350 	if (tb[TCA_HTB_PARMS] == NULL)
1351 		goto failure;
1352 
1353 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1354 
1355 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1356 	if (!hopt->rate.rate || !hopt->ceil.rate)
1357 		goto failure;
1358 
1359 	/* Keeping backward compatible with rate_table based iproute2 tc */
1360 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1361 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1362 					      NULL));
1363 
1364 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1365 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1366 					      NULL));
1367 
1368 	if (!cl) {		/* new class */
1369 		struct Qdisc *new_q;
1370 		int prio;
1371 		struct {
1372 			struct nlattr		nla;
1373 			struct gnet_estimator	opt;
1374 		} est = {
1375 			.nla = {
1376 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1377 				.nla_type	= TCA_RATE,
1378 			},
1379 			.opt = {
1380 				/* 4s interval, 16s averaging constant */
1381 				.interval	= 2,
1382 				.ewma_log	= 2,
1383 			},
1384 		};
1385 
1386 		/* check for valid classid */
1387 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1388 		    htb_find(classid, sch))
1389 			goto failure;
1390 
1391 		/* check maximal depth */
1392 		if (parent && parent->parent && parent->parent->level < 2) {
1393 			pr_err("htb: tree is too deep\n");
1394 			goto failure;
1395 		}
1396 		err = -ENOBUFS;
1397 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1398 		if (!cl)
1399 			goto failure;
1400 
1401 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1402 		if (err) {
1403 			kfree(cl);
1404 			goto failure;
1405 		}
1406 		if (htb_rate_est || tca[TCA_RATE]) {
1407 			err = gen_new_estimator(&cl->bstats, NULL,
1408 						&cl->rate_est,
1409 						NULL,
1410 						qdisc_root_sleeping_running(sch),
1411 						tca[TCA_RATE] ? : &est.nla);
1412 			if (err) {
1413 				tcf_block_put(cl->block);
1414 				kfree(cl);
1415 				goto failure;
1416 			}
1417 		}
1418 
1419 		cl->children = 0;
1420 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1421 		RB_CLEAR_NODE(&cl->pq_node);
1422 
1423 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1424 			RB_CLEAR_NODE(&cl->node[prio]);
1425 
1426 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1427 		 * so that can't be used inside of sch_tree_lock
1428 		 * -- thanks to Karlis Peisenieks
1429 		 */
1430 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1431 					  classid, NULL);
1432 		sch_tree_lock(sch);
1433 		if (parent && !parent->level) {
1434 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1435 			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1436 
1437 			/* turn parent into inner node */
1438 			qdisc_reset(parent->un.leaf.q);
1439 			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1440 			qdisc_destroy(parent->un.leaf.q);
1441 			if (parent->prio_activity)
1442 				htb_deactivate(q, parent);
1443 
1444 			/* remove from evt list because of level change */
1445 			if (parent->cmode != HTB_CAN_SEND) {
1446 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1447 				parent->cmode = HTB_CAN_SEND;
1448 			}
1449 			parent->level = (parent->parent ? parent->parent->level
1450 					 : TC_HTB_MAXDEPTH) - 1;
1451 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1452 		}
1453 		/* leaf (we) needs elementary qdisc */
1454 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1455 
1456 		cl->common.classid = classid;
1457 		cl->parent = parent;
1458 
1459 		/* set class to be in HTB_CAN_SEND state */
1460 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1461 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1462 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1463 		cl->t_c = ktime_get_ns();
1464 		cl->cmode = HTB_CAN_SEND;
1465 
1466 		/* attach to the hash list and parent's family */
1467 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1468 		if (parent)
1469 			parent->children++;
1470 		if (cl->un.leaf.q != &noop_qdisc)
1471 			qdisc_hash_add(cl->un.leaf.q, true);
1472 	} else {
1473 		if (tca[TCA_RATE]) {
1474 			err = gen_replace_estimator(&cl->bstats, NULL,
1475 						    &cl->rate_est,
1476 						    NULL,
1477 						    qdisc_root_sleeping_running(sch),
1478 						    tca[TCA_RATE]);
1479 			if (err)
1480 				return err;
1481 		}
1482 		sch_tree_lock(sch);
1483 	}
1484 
1485 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1486 
1487 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1488 
1489 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1490 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1491 
1492 	/* it used to be a nasty bug here, we have to check that node
1493 	 * is really leaf before changing cl->un.leaf !
1494 	 */
1495 	if (!cl->level) {
1496 		u64 quantum = cl->rate.rate_bytes_ps;
1497 
1498 		do_div(quantum, q->rate2quantum);
1499 		cl->quantum = min_t(u64, quantum, INT_MAX);
1500 
1501 		if (!hopt->quantum && cl->quantum < 1000) {
1502 			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1503 				cl->common.classid);
1504 			cl->quantum = 1000;
1505 		}
1506 		if (!hopt->quantum && cl->quantum > 200000) {
1507 			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1508 				cl->common.classid);
1509 			cl->quantum = 200000;
1510 		}
1511 		if (hopt->quantum)
1512 			cl->quantum = hopt->quantum;
1513 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1514 			cl->prio = TC_HTB_NUMPRIO - 1;
1515 	}
1516 
1517 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1518 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1519 
1520 	sch_tree_unlock(sch);
1521 
1522 	qdisc_class_hash_grow(sch, &q->clhash);
1523 
1524 	*arg = (unsigned long)cl;
1525 	return 0;
1526 
1527 failure:
1528 	return err;
1529 }
1530 
1531 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
1532 				       struct netlink_ext_ack *extack)
1533 {
1534 	struct htb_sched *q = qdisc_priv(sch);
1535 	struct htb_class *cl = (struct htb_class *)arg;
1536 
1537 	return cl ? cl->block : q->block;
1538 }
1539 
1540 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1541 				     u32 classid)
1542 {
1543 	struct htb_class *cl = htb_find(classid, sch);
1544 
1545 	/*if (cl && !cl->level) return 0;
1546 	 * The line above used to be there to prevent attaching filters to
1547 	 * leaves. But at least tc_index filter uses this just to get class
1548 	 * for other reasons so that we have to allow for it.
1549 	 * ----
1550 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1551 	 * another way to "lock" the class - unlike "get" this lock can
1552 	 * be broken by class during destroy IIUC.
1553 	 */
1554 	if (cl)
1555 		cl->filter_cnt++;
1556 	return (unsigned long)cl;
1557 }
1558 
1559 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1560 {
1561 	struct htb_class *cl = (struct htb_class *)arg;
1562 
1563 	if (cl)
1564 		cl->filter_cnt--;
1565 }
1566 
1567 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1568 {
1569 	struct htb_sched *q = qdisc_priv(sch);
1570 	struct htb_class *cl;
1571 	unsigned int i;
1572 
1573 	if (arg->stop)
1574 		return;
1575 
1576 	for (i = 0; i < q->clhash.hashsize; i++) {
1577 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1578 			if (arg->count < arg->skip) {
1579 				arg->count++;
1580 				continue;
1581 			}
1582 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1583 				arg->stop = 1;
1584 				return;
1585 			}
1586 			arg->count++;
1587 		}
1588 	}
1589 }
1590 
1591 static const struct Qdisc_class_ops htb_class_ops = {
1592 	.graft		=	htb_graft,
1593 	.leaf		=	htb_leaf,
1594 	.qlen_notify	=	htb_qlen_notify,
1595 	.find		=	htb_search,
1596 	.change		=	htb_change_class,
1597 	.delete		=	htb_delete,
1598 	.walk		=	htb_walk,
1599 	.tcf_block	=	htb_tcf_block,
1600 	.bind_tcf	=	htb_bind_filter,
1601 	.unbind_tcf	=	htb_unbind_filter,
1602 	.dump		=	htb_dump_class,
1603 	.dump_stats	=	htb_dump_class_stats,
1604 };
1605 
1606 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1607 	.cl_ops		=	&htb_class_ops,
1608 	.id		=	"htb",
1609 	.priv_size	=	sizeof(struct htb_sched),
1610 	.enqueue	=	htb_enqueue,
1611 	.dequeue	=	htb_dequeue,
1612 	.peek		=	qdisc_peek_dequeued,
1613 	.init		=	htb_init,
1614 	.reset		=	htb_reset,
1615 	.destroy	=	htb_destroy,
1616 	.dump		=	htb_dump,
1617 	.owner		=	THIS_MODULE,
1618 };
1619 
1620 static int __init htb_module_init(void)
1621 {
1622 	return register_qdisc(&htb_qdisc_ops);
1623 }
1624 static void __exit htb_module_exit(void)
1625 {
1626 	unregister_qdisc(&htb_qdisc_ops);
1627 }
1628 
1629 module_init(htb_module_init)
1630 module_exit(htb_module_exit)
1631 MODULE_LICENSE("GPL");
1632