xref: /openbmc/linux/net/sched/sch_htb.c (revision c4c11dd1)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 
44 /* HTB algorithm.
45     Author: devik@cdi.cz
46     ========================================================================
47     HTB is like TBF with multiple classes. It is also similar to CBQ because
48     it allows assigning a priority to each class in the hierarchy.
49     In fact it is another implementation of Floyd's formal sharing.
50 
51     Levels:
52     Each class is assigned a level. Leaves ALWAYS have level 0 and root
53     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
54     one less than their parent.
55 */
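/* For orientation only (illustrative, not part of this file): a hierarchy like
 * the one described above is typically built from userspace with tc, e.g.
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 10
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:10 htb rate 30mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 70mbit ceil 100mbit
 *
 * Here 1:1 is an interior class and 1:10/1:20 are leaves (level 0) that may
 * borrow from 1:1 up to their ceil; exact option names depend on the iproute2
 * version in use.
 */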
56 
57 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
58 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
59 
60 #if HTB_VER >> 16 != TC_HTB_PROTOVER
61 #error "Mismatched sch_htb.c and pkt_sch.h"
62 #endif
63 
64 /* Module parameter and sysfs export */
65 module_param    (htb_hysteresis, int, 0640);
66 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
67 
68 static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
69 module_param(htb_rate_est, int, 0640);
70 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
71 
72 /* used internally to keep the status of a single class */
73 enum htb_cmode {
74 	HTB_CANT_SEND,		/* class can't send and can't borrow */
75 	HTB_MAY_BORROW,		/* class can't send but may borrow */
76 	HTB_CAN_SEND		/* class can send */
77 };
78 
79 struct htb_prio {
80 	union {
81 		struct rb_root	row;
82 		struct rb_root	feed;
83 	};
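	/* The same rb_root plays two roles: it is used as 'row' when this
	 * struct is embedded in a per-level htb_level (self feed slot) and as
	 * 'feed' when it is embedded in an inner class (parent feed slot).
	 */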
84 	struct rb_node	*ptr;
85 	/* When a class changes from state 1->2 and disconnects from
86 	 * its parent's feed, we lose the ptr value and would start from the
87 	 * first child again. Here we store the classid of the
88 	 * last valid ptr (used when ptr is NULL).
89 	 */
90 	u32		last_ptr_id;
91 };
92 
93 /* interior & leaf nodes; props specific to leaves are marked L:
94  * To reduce false sharing, place mostly-read fields at the beginning
95  * and mostly-written ones at the end.
96  */
97 struct htb_class {
98 	struct Qdisc_class_common common;
99 	struct psched_ratecfg	rate;
100 	struct psched_ratecfg	ceil;
101 	s64			buffer, cbuffer;/* token bucket depth/rate */
102 	s64			mbuffer;	/* max wait time */
103 	int			prio;		/* these two are used only by leaves... */
104 	int			quantum;	/* but stored for parent-to-leaf return */
105 
106 	struct tcf_proto	*filter_list;	/* class attached filters */
107 	int			filter_cnt;
108 	int			refcnt;		/* usage count of this class */
109 
110 	int			level;		/* our level (see above) */
111 	unsigned int		children;
112 	struct htb_class	*parent;	/* parent class */
113 
114 	struct gnet_stats_rate_est64 rate_est;
115 
116 	/*
117 	 * Written often fields
118 	 */
119 	struct gnet_stats_basic_packed bstats;
120 	struct gnet_stats_queue	qstats;
121 	struct tc_htb_xstats	xstats;	/* our special stats */
122 
123 	/* token bucket parameters */
124 	s64			tokens, ctokens;/* current number of tokens */
125 	s64			t_c;		/* checkpoint time */
126 
127 	union {
128 		struct htb_class_leaf {
129 			struct list_head drop_list;
130 			int		deficit[TC_HTB_MAXDEPTH];
131 			struct Qdisc	*q;
132 		} leaf;
133 		struct htb_class_inner {
134 			struct htb_prio clprio[TC_HTB_NUMPRIO];
135 		} inner;
136 	} un;
137 	s64			pq_key;
138 
139 	int			prio_activity;	/* for which prios are we active */
140 	enum htb_cmode		cmode;		/* current mode of the class */
141 	struct rb_node		pq_node;	/* node for event queue */
142 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto	*filter_list;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int		warned;	/* only one warning */
160 	int			direct_qlen;
161 	struct work_struct	work;
162 
163 	/* non-shaped skbs; let them go directly thru */
164 	struct sk_buff_head	direct_queue;
165 	long			direct_pkts;
166 
167 	struct qdisc_watchdog	watchdog;
168 
169 	s64			now;	/* cached dequeue time */
170 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
171 
172 	/* time of nearest event per level (row) */
173 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
174 
175 	int			row_mask[TC_HTB_MAXDEPTH];
176 
177 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
178 };
179 
180 /* find class in global hash table using given handle */
181 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
182 {
183 	struct htb_sched *q = qdisc_priv(sch);
184 	struct Qdisc_class_common *clc;
185 
186 	clc = qdisc_class_find(&q->clhash, handle);
187 	if (clc == NULL)
188 		return NULL;
189 	return container_of(clc, struct htb_class, common);
190 }
191 
192 /**
193  * htb_classify - classify a packet into class
194  *
195  * It returns NULL if the packet should be dropped or -1 if the packet
196  * should be passed directly thru. In all other cases a leaf class is returned.
197  * We allow direct class selection by classid in skb->priority. Then we examine
198  * filters in the qdisc and in inner nodes (if a higher filter points to an inner
199  * node). If we end up with classid MAJOR:0 we enqueue the skb into a special
200  * internal fifo (direct). These packets then go directly thru. If we still
201  * have no valid leaf we try to use the MAJOR:default leaf. If still unsuccessful,
202  * we finish and return the direct queue.
203  */
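/* Illustrative example (assumed values, not taken from this file): with qdisc
 * handle 1:, an application that sets skb->priority to 0x00010010 (i.e. 1:10)
 * is put straight into leaf class 1:10 without consulting any filters, while
 * skb->priority == 0x00010000 (1:0) selects the direct queue.
 */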
204 #define HTB_DIRECT ((struct htb_class *)-1L)
205 
206 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
207 				      int *qerr)
208 {
209 	struct htb_sched *q = qdisc_priv(sch);
210 	struct htb_class *cl;
211 	struct tcf_result res;
212 	struct tcf_proto *tcf;
213 	int result;
214 
215 	/* allow selecting a class by setting skb->priority to a valid classid;
216 	 * note that nfmark can be used too by attaching an fw filter with no
217 	 * rules in it
218 	 */
219 	if (skb->priority == sch->handle)
220 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
221 	cl = htb_find(skb->priority, sch);
222 	if (cl && cl->level == 0)
223 		return cl;
224 
225 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
226 	tcf = q->filter_list;
227 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
228 #ifdef CONFIG_NET_CLS_ACT
229 		switch (result) {
230 		case TC_ACT_QUEUED:
231 		case TC_ACT_STOLEN:
232 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
233 		case TC_ACT_SHOT:
234 			return NULL;
235 		}
236 #endif
237 		cl = (void *)res.class;
238 		if (!cl) {
239 			if (res.classid == sch->handle)
240 				return HTB_DIRECT;	/* X:0 (direct flow) */
241 			cl = htb_find(res.classid, sch);
242 			if (!cl)
243 				break;	/* filter selected invalid classid */
244 		}
245 		if (!cl->level)
246 			return cl;	/* we hit leaf; return it */
247 
248 		/* we have got inner class; apply inner filter chain */
249 		tcf = cl->filter_list;
250 	}
251 	/* classification failed; try to use default class */
252 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
253 	if (!cl || cl->level)
254 		return HTB_DIRECT;	/* bad default .. this is safe bet */
255 	return cl;
256 }
257 
258 /**
259  * htb_add_to_id_tree - adds class to the round robin list
260  *
261  * The routine adds the class to the list (actually a tree) sorted by classid.
262  * Make sure that the class is not already on such a list for the given prio.
263  */
264 static void htb_add_to_id_tree(struct rb_root *root,
265 			       struct htb_class *cl, int prio)
266 {
267 	struct rb_node **p = &root->rb_node, *parent = NULL;
268 
269 	while (*p) {
270 		struct htb_class *c;
271 		parent = *p;
272 		c = rb_entry(parent, struct htb_class, node[prio]);
273 
274 		if (cl->common.classid > c->common.classid)
275 			p = &parent->rb_right;
276 		else
277 			p = &parent->rb_left;
278 	}
279 	rb_link_node(&cl->node[prio], parent, p);
280 	rb_insert_color(&cl->node[prio], root);
281 }
282 
283 /**
284  * htb_add_to_wait_tree - adds class to the event queue with delay
285  *
286  * The class is added to priority event queue to indicate that class will
287  * The class is added to the priority event queue to indicate that the class will
288  * change its mode at time cl->pq_key (in nanoseconds). Make sure that the class is not
289  */
290 static void htb_add_to_wait_tree(struct htb_sched *q,
291 				 struct htb_class *cl, s64 delay)
292 {
293 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
294 
295 	cl->pq_key = q->now + delay;
296 	if (cl->pq_key == q->now)
297 		cl->pq_key++;
298 
299 	/* update the nearest event cache */
300 	if (q->near_ev_cache[cl->level] > cl->pq_key)
301 		q->near_ev_cache[cl->level] = cl->pq_key;
302 
303 	while (*p) {
304 		struct htb_class *c;
305 		parent = *p;
306 		c = rb_entry(parent, struct htb_class, pq_node);
307 		if (cl->pq_key >= c->pq_key)
308 			p = &parent->rb_right;
309 		else
310 			p = &parent->rb_left;
311 	}
312 	rb_link_node(&cl->pq_node, parent, p);
313 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
314 }
315 
316 /**
317  * htb_next_rb_node - finds next node in binary tree
318  *
319  * When we are past the last key, *n becomes NULL.
320  * Average complexity is 2 steps per call.
321  */
322 static inline void htb_next_rb_node(struct rb_node **n)
323 {
324 	*n = rb_next(*n);
325 }
326 
327 /**
328  * htb_add_class_to_row - add class to its row
329  *
330  * The class is added to row at priorities marked in mask.
331  * It does nothing if mask == 0.
332  */
333 static inline void htb_add_class_to_row(struct htb_sched *q,
334 					struct htb_class *cl, int mask)
335 {
336 	q->row_mask[cl->level] |= mask;
337 	while (mask) {
338 		int prio = ffz(~mask);
339 		mask &= ~(1 << prio);
340 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
341 	}
342 }
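/* Note on the bit walk used here and in the other mask loops: ffz(~mask)
 * returns the index of the lowest set bit in mask (e.g. mask == 0x06 yields
 * prio 1), which is then cleared with mask &= ~(1 << prio) so every active
 * prio is visited exactly once.
 */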
343 
344 /* If this triggers, it is a bug in this code, but it need not be fatal */
345 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
346 {
347 	if (RB_EMPTY_NODE(rb)) {
348 		WARN_ON(1);
349 	} else {
350 		rb_erase(rb, root);
351 		RB_CLEAR_NODE(rb);
352 	}
353 }
354 
355 
356 /**
357  * htb_remove_class_from_row - removes class from its row
358  *
359  * The class is removed from row at priorities marked in mask.
360  * It does nothing if mask == 0.
361  */
362 static inline void htb_remove_class_from_row(struct htb_sched *q,
363 						 struct htb_class *cl, int mask)
364 {
365 	int m = 0;
366 	struct htb_level *hlevel = &q->hlevel[cl->level];
367 
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		struct htb_prio *hprio = &hlevel->hprio[prio];
371 
372 		mask &= ~(1 << prio);
373 		if (hprio->ptr == cl->node + prio)
374 			htb_next_rb_node(&hprio->ptr);
375 
376 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
377 		if (!hprio->row.rb_node)
378 			m |= 1 << prio;
379 	}
380 	q->row_mask[cl->level] &= ~m;
381 }
382 
383 /**
384  * htb_activate_prios - creates the active class's feed chain
385  *
386  * The class is connected to its ancestors and/or the appropriate rows
387  * for the priorities it participates in. cl->cmode must be the new
388  * (activated) mode. It does nothing if cl->prio_activity == 0.
389  */
390 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
391 {
392 	struct htb_class *p = cl->parent;
393 	long m, mask = cl->prio_activity;
394 
395 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
396 		m = mask;
397 		while (m) {
398 			int prio = ffz(~m);
399 			m &= ~(1 << prio);
400 
401 			if (p->un.inner.clprio[prio].feed.rb_node)
402 				/* parent already has its feed in use, so
403 				 * reset the bit in mask as the parent is already ok
404 				 */
405 				mask &= ~(1 << prio);
406 
407 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
408 		}
409 		p->prio_activity |= mask;
410 		cl = p;
411 		p = cl->parent;
412 
413 	}
414 	if (cl->cmode == HTB_CAN_SEND && mask)
415 		htb_add_class_to_row(q, cl, mask);
416 }
417 
418 /**
419  * htb_deactivate_prios - remove class from feed chain
420  *
421  * cl->cmode must represent the old mode (before deactivation). It does
422  * nothing if cl->prio_activity == 0. The class is removed from all feed
423  * chains and rows.
424  */
425 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
426 {
427 	struct htb_class *p = cl->parent;
428 	long m, mask = cl->prio_activity;
429 
430 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
431 		m = mask;
432 		mask = 0;
433 		while (m) {
434 			int prio = ffz(~m);
435 			m &= ~(1 << prio);
436 
437 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
438 				/* we are removing a child which is pointed to from
439 				 * the parent feed - forget the pointer but remember
440 				 * the classid
441 				 */
442 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
443 				p->un.inner.clprio[prio].ptr = NULL;
444 			}
445 
446 			htb_safe_rb_erase(cl->node + prio,
447 					  &p->un.inner.clprio[prio].feed);
448 
449 			if (!p->un.inner.clprio[prio].feed.rb_node)
450 				mask |= 1 << prio;
451 		}
452 
453 		p->prio_activity &= ~mask;
454 		cl = p;
455 		p = cl->parent;
456 
457 	}
458 	if (cl->cmode == HTB_CAN_SEND && mask)
459 		htb_remove_class_from_row(q, cl, mask);
460 }
461 
462 static inline s64 htb_lowater(const struct htb_class *cl)
463 {
464 	if (htb_hysteresis)
465 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
466 	else
467 		return 0;
468 }
469 static inline s64 htb_hiwater(const struct htb_class *cl)
470 {
471 	if (htb_hysteresis)
472 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
473 	else
474 		return 0;
475 }
476 
477 
478 /**
479  * htb_class_mode - computes and returns current class mode
480  *
481  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
482  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
483  * from now to time when cl will change its state.
484  * Also it is worth to note that class mode doesn't change simply
485  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
486  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
487  * mode transitions per time unit. The speed gain is about 1/6.
488  */
489 static inline enum htb_cmode
490 htb_class_mode(struct htb_class *cl, s64 *diff)
491 {
492 	s64 toks;
493 
494 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
495 		*diff = -toks;
496 		return HTB_CANT_SEND;
497 	}
498 
499 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
500 		return HTB_CAN_SEND;
501 
502 	*diff = -toks;
503 	return HTB_MAY_BORROW;
504 }
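/* Worked example (illustrative numbers only): with htb_hysteresis enabled, a
 * class currently in HTB_CAN_SEND whose ctokens + diff is still above the low
 * water mark and whose tokens + diff dropped to -cl->buffer / 2 stays in
 * HTB_CAN_SEND, because htb_hiwater() lowers the threshold to -cl->buffer for
 * that mode; only once tokens fall below -cl->buffer does it switch to
 * HTB_MAY_BORROW. Without hysteresis the switch would happen already at
 * tokens < 0.
 */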
505 
506 /**
507  * htb_change_class_mode - changes the class's mode
508  *
509  * This should be the only way to change a class's mode under normal
510  * circumstances. The routine will update the feed list linkage, change the mode
511  * and add the class to the wait event queue if appropriate. The new mode should
512  * be different from the old one and cl->pq_key has to be valid if changing
513  * to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
514  */
515 static void
516 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
517 {
518 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
519 
520 	if (new_mode == cl->cmode)
521 		return;
522 
523 	if (cl->prio_activity) {	/* not necessary: speed optimization */
524 		if (cl->cmode != HTB_CANT_SEND)
525 			htb_deactivate_prios(q, cl);
526 		cl->cmode = new_mode;
527 		if (new_mode != HTB_CANT_SEND)
528 			htb_activate_prios(q, cl);
529 	} else
530 		cl->cmode = new_mode;
531 }
532 
533 /**
534  * htb_activate - inserts leaf cl into appropriate active feeds
535  *
536  * The routine learns the (new) priority of the leaf and activates the feed chain
537  * for that prio. It can safely be called on an already active leaf.
538  * It also adds the leaf to the drop list.
539  */
540 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
541 {
542 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
543 
544 	if (!cl->prio_activity) {
545 		cl->prio_activity = 1 << cl->prio;
546 		htb_activate_prios(q, cl);
547 		list_add_tail(&cl->un.leaf.drop_list,
548 			      q->drops + cl->prio);
549 	}
550 }
551 
552 /**
553  * htb_deactivate - remove leaf cl from active feeds
554  *
555  * Make sure that the leaf is active. In other words, it can't be called
556  * with a non-active leaf. It also removes the class from the drop list.
557  */
558 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
559 {
560 	WARN_ON(!cl->prio_activity);
561 
562 	htb_deactivate_prios(q, cl);
563 	cl->prio_activity = 0;
564 	list_del_init(&cl->un.leaf.drop_list);
565 }
566 
567 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
568 {
569 	int uninitialized_var(ret);
570 	struct htb_sched *q = qdisc_priv(sch);
571 	struct htb_class *cl = htb_classify(skb, sch, &ret);
572 
573 	if (cl == HTB_DIRECT) {
574 		/* enqueue to helper queue */
575 		if (q->direct_queue.qlen < q->direct_qlen) {
576 			__skb_queue_tail(&q->direct_queue, skb);
577 			q->direct_pkts++;
578 		} else {
579 			return qdisc_drop(skb, sch);
580 		}
581 #ifdef CONFIG_NET_CLS_ACT
582 	} else if (!cl) {
583 		if (ret & __NET_XMIT_BYPASS)
584 			sch->qstats.drops++;
585 		kfree_skb(skb);
586 		return ret;
587 #endif
588 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
589 		if (net_xmit_drop_count(ret)) {
590 			sch->qstats.drops++;
591 			cl->qstats.drops++;
592 		}
593 		return ret;
594 	} else {
595 		htb_activate(q, cl);
596 	}
597 
598 	sch->q.qlen++;
599 	return NET_XMIT_SUCCESS;
600 }
601 
602 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
603 {
604 	s64 toks = diff + cl->tokens;
605 
606 	if (toks > cl->buffer)
607 		toks = cl->buffer;
608 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
609 	if (toks <= -cl->mbuffer)
610 		toks = 1 - cl->mbuffer;
611 
612 	cl->tokens = toks;
613 }
614 
615 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
616 {
617 	s64 toks = diff + cl->ctokens;
618 
619 	if (toks > cl->cbuffer)
620 		toks = cl->cbuffer;
621 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
622 	if (toks <= -cl->mbuffer)
623 		toks = 1 - cl->mbuffer;
624 
625 	cl->ctokens = toks;
626 }
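/* Rough scale of the accounting above (illustrative figures): psched_l2t_ns()
 * converts a packet length into its transmission time at the configured rate,
 * so a 1500 byte packet costs about 120000 ns of tokens at 100 Mbit/s and
 * about 1200 ns at 10 Gbit/s; the result is clamped to [1 - mbuffer, buffer]
 * so a long idle or busy period cannot build up unbounded credit or debt.
 */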
627 
628 /**
629  * htb_charge_class - charges amount "bytes" to leaf and ancestors
630  *
631  * The routine assumes that a packet "bytes" long was dequeued from leaf cl
632  * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
633  * the leaf and all ancestors and to the rate bucket for ancestors at levels
634  * "level" and higher. It also handles a possible change of mode resulting
635  * from the update. Note that the mode can also increase here (MAY_BORROW to
636  * CAN_SEND) because we can use a more precise clock than the event queue here.
637  * In such a case we remove the class from the event queue first.
638  */
639 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
640 			     int level, struct sk_buff *skb)
641 {
642 	int bytes = qdisc_pkt_len(skb);
643 	enum htb_cmode old_mode;
644 	s64 diff;
645 
646 	while (cl) {
647 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
648 		if (cl->level >= level) {
649 			if (cl->level == level)
650 				cl->xstats.lends++;
651 			htb_accnt_tokens(cl, bytes, diff);
652 		} else {
653 			cl->xstats.borrows++;
654 			cl->tokens += diff;	/* we moved t_c; update tokens */
655 		}
656 		htb_accnt_ctokens(cl, bytes, diff);
657 		cl->t_c = q->now;
658 
659 		old_mode = cl->cmode;
660 		diff = 0;
661 		htb_change_class_mode(q, cl, &diff);
662 		if (old_mode != cl->cmode) {
663 			if (old_mode != HTB_CAN_SEND)
664 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
665 			if (cl->cmode != HTB_CAN_SEND)
666 				htb_add_to_wait_tree(q, cl, diff);
667 		}
668 
669 		/* update basic stats except for leaves which are already updated */
670 		if (cl->level)
671 			bstats_update(&cl->bstats, skb);
672 
673 		cl = cl->parent;
674 	}
675 }
676 
677 /**
678  * htb_do_events - make mode changes to classes at the level
679  *
680  * Scans the event queue for pending events and applies them. Returns the time of
681  * the next pending event (0 for no event in pq, q->now for too many events).
682  * Note: only events with cl->pq_key <= q->now are applied.
683  */
684 static s64 htb_do_events(struct htb_sched *q, const int level,
685 			 unsigned long start)
686 {
687 	/* don't run for longer than 2 jiffies; 2 is used instead of
688 	 * 1 to simplify things when jiffy is going to be incremented
689 	 * too soon
690 	 */
691 	unsigned long stop_at = start + 2;
692 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
693 
694 	while (time_before(jiffies, stop_at)) {
695 		struct htb_class *cl;
696 		s64 diff;
697 		struct rb_node *p = rb_first(wait_pq);
698 
699 		if (!p)
700 			return 0;
701 
702 		cl = rb_entry(p, struct htb_class, pq_node);
703 		if (cl->pq_key > q->now)
704 			return cl->pq_key;
705 
706 		htb_safe_rb_erase(p, wait_pq);
707 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
708 		htb_change_class_mode(q, cl, &diff);
709 		if (cl->cmode != HTB_CAN_SEND)
710 			htb_add_to_wait_tree(q, cl, diff);
711 	}
712 
713 	/* too much load - let's continue after a break for scheduling */
714 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
715 		pr_warning("htb: too many events!\n");
716 		q->warned |= HTB_WARN_TOOMANYEVENTS;
717 	}
718 
719 	return q->now;
720 }
721 
722 /* Returns class->node+prio from the id-tree where the class's id is >= id, or
723  * NULL if no such class exists.
724  */
725 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
726 					      u32 id)
727 {
728 	struct rb_node *r = NULL;
729 	while (n) {
730 		struct htb_class *cl =
731 		    rb_entry(n, struct htb_class, node[prio]);
732 
733 		if (id > cl->common.classid) {
734 			n = n->rb_right;
735 		} else if (id < cl->common.classid) {
736 			r = n;
737 			n = n->rb_left;
738 		} else {
739 			return n;
740 		}
741 	}
742 	return r;
743 }
744 
745 /**
746  * htb_lookup_leaf - returns next leaf class in DRR order
747  *
748  * Find the leaf the current feed pointer points to.
749  */
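/* Implementation note: the walk below keeps one {root, pptr, pid} frame per
 * tree level on a small stack. At each inner class it descends into that
 * class's per-prio feed tree; at a leaf it returns. When a stored ptr has been
 * invalidated (set to NULL on deactivation), htb_id_find_next_upper() restores
 * the position from the remembered classid.
 */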
750 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
751 {
752 	int i;
753 	struct {
754 		struct rb_node *root;
755 		struct rb_node **pptr;
756 		u32 *pid;
757 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
758 
759 	BUG_ON(!hprio->row.rb_node);
760 	sp->root = hprio->row.rb_node;
761 	sp->pptr = &hprio->ptr;
762 	sp->pid = &hprio->last_ptr_id;
763 
764 	for (i = 0; i < 65535; i++) {
765 		if (!*sp->pptr && *sp->pid) {
766 			/* ptr was invalidated but id is valid - try to recover
767 			 * the original or next ptr
768 			 */
769 			*sp->pptr =
770 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
771 		}
772 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
773 				 * can become out of date quickly
774 				 */
775 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
776 			*sp->pptr = sp->root;
777 			while ((*sp->pptr)->rb_left)
778 				*sp->pptr = (*sp->pptr)->rb_left;
779 			if (sp > stk) {
780 				sp--;
781 				if (!*sp->pptr) {
782 					WARN_ON(1);
783 					return NULL;
784 				}
785 				htb_next_rb_node(sp->pptr);
786 			}
787 		} else {
788 			struct htb_class *cl;
789 			struct htb_prio *clp;
790 
791 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
792 			if (!cl->level)
793 				return cl;
794 			clp = &cl->un.inner.clprio[prio];
795 			(++sp)->root = clp->feed.rb_node;
796 			sp->pptr = &clp->ptr;
797 			sp->pid = &clp->last_ptr_id;
798 		}
799 	}
800 	WARN_ON(1);
801 	return NULL;
802 }
803 
804 /* dequeues a packet at the given priority and level; call only if
805  * you are sure that there is an active class at prio/level
806  */
807 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
808 					const int level)
809 {
810 	struct sk_buff *skb = NULL;
811 	struct htb_class *cl, *start;
812 	struct htb_level *hlevel = &q->hlevel[level];
813 	struct htb_prio *hprio = &hlevel->hprio[prio];
814 
815 	/* look initial class up in the row */
816 	start = cl = htb_lookup_leaf(hprio, prio);
817 
818 	do {
819 next:
820 		if (unlikely(!cl))
821 			return NULL;
822 
823 		/* the class can be empty - it is unlikely but can be true if the leaf
824 		 * qdisc drops packets in its enqueue routine or if someone used a
825 		 * graft operation on the leaf since the last dequeue;
826 		 * simply deactivate and skip such a class
827 		 */
828 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
829 			struct htb_class *next;
830 			htb_deactivate(q, cl);
831 
832 			/* row/level might become empty */
833 			if ((q->row_mask[level] & (1 << prio)) == 0)
834 				return NULL;
835 
836 			next = htb_lookup_leaf(hprio, prio);
837 
838 			if (cl == start)	/* fix start if we just deleted it */
839 				start = next;
840 			cl = next;
841 			goto next;
842 		}
843 
844 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
845 		if (likely(skb != NULL))
846 			break;
847 
848 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
849 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
850 					 &q->hlevel[0].hprio[prio].ptr);
851 		cl = htb_lookup_leaf(hprio, prio);
852 
853 	} while (cl != start);
854 
855 	if (likely(skb != NULL)) {
856 		bstats_update(&cl->bstats, skb);
857 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
858 		if (cl->un.leaf.deficit[level] < 0) {
859 			cl->un.leaf.deficit[level] += cl->quantum;
860 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
861 						 &q->hlevel[0].hprio[prio].ptr);
862 		}
863 		/* this used to be after charge_class but this constellation
864 		 * gives us slightly better performance
865 		 */
866 		if (!cl->un.leaf.q->q.qlen)
867 			htb_deactivate(q, cl);
868 		htb_charge_class(q, cl, level, skb);
869 	}
870 	return skb;
871 }
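/* DRR arithmetic above, with illustrative numbers: a leaf with quantum 10000
 * keeps the feed pointer until its per-level deficit goes negative, at which
 * point the deficit is topped up by one quantum and the pointer advances; with
 * 1500 byte packets that is roughly 6-7 packets per round, i.e. each active
 * leaf gets about one quantum's worth of bytes per DRR round.
 */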
872 
873 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
874 {
875 	struct sk_buff *skb;
876 	struct htb_sched *q = qdisc_priv(sch);
877 	int level;
878 	s64 next_event;
879 	unsigned long start_at;
880 
881 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
882 	skb = __skb_dequeue(&q->direct_queue);
883 	if (skb != NULL) {
884 ok:
885 		qdisc_bstats_update(sch, skb);
886 		qdisc_unthrottled(sch);
887 		sch->q.qlen--;
888 		return skb;
889 	}
890 
891 	if (!sch->q.qlen)
892 		goto fin;
893 	q->now = ktime_to_ns(ktime_get());
894 	start_at = jiffies;
895 
896 	next_event = q->now + 5LLU * NSEC_PER_SEC;
897 
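	/* Levels are scanned from the leaves (0) upwards so that classes able
	 * to send at their own rate are preferred over classes that would have
	 * to borrow from an ancestor.
	 */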
898 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
899 		/* common case optimization - skip event handler quickly */
900 		int m;
901 		s64 event = q->near_ev_cache[level];
902 
903 		if (q->now >= event) {
904 			event = htb_do_events(q, level, start_at);
905 			if (!event)
906 				event = q->now + NSEC_PER_SEC;
907 			q->near_ev_cache[level] = event;
908 		}
909 
910 		if (next_event > event)
911 			next_event = event;
912 
913 		m = ~q->row_mask[level];
914 		while (m != (int)(-1)) {
915 			int prio = ffz(m);
916 
917 			m |= 1 << prio;
918 			skb = htb_dequeue_tree(q, prio, level);
919 			if (likely(skb != NULL))
920 				goto ok;
921 		}
922 	}
923 	sch->qstats.overlimits++;
924 	if (likely(next_event > q->now)) {
925 		if (!test_bit(__QDISC_STATE_DEACTIVATED,
926 			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
927 			ktime_t time = ns_to_ktime(next_event);
928 			qdisc_throttled(q->watchdog.qdisc);
929 			hrtimer_start(&q->watchdog.timer, time,
930 				      HRTIMER_MODE_ABS);
931 		}
932 	} else {
933 		schedule_work(&q->work);
934 	}
935 fin:
936 	return skb;
937 }
938 
939 /* try to drop from each class (by prio) until one succeeds */
940 static unsigned int htb_drop(struct Qdisc *sch)
941 {
942 	struct htb_sched *q = qdisc_priv(sch);
943 	int prio;
944 
945 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
946 		struct list_head *p;
947 		list_for_each(p, q->drops + prio) {
948 			struct htb_class *cl = list_entry(p, struct htb_class,
949 							  un.leaf.drop_list);
950 			unsigned int len;
951 			if (cl->un.leaf.q->ops->drop &&
952 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
953 				sch->q.qlen--;
954 				if (!cl->un.leaf.q->q.qlen)
955 					htb_deactivate(q, cl);
956 				return len;
957 			}
958 		}
959 	}
960 	return 0;
961 }
962 
963 /* reset all classes */
964 /* always called under BH & queue lock */
965 static void htb_reset(struct Qdisc *sch)
966 {
967 	struct htb_sched *q = qdisc_priv(sch);
968 	struct htb_class *cl;
969 	unsigned int i;
970 
971 	for (i = 0; i < q->clhash.hashsize; i++) {
972 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
973 			if (cl->level)
974 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
975 			else {
976 				if (cl->un.leaf.q)
977 					qdisc_reset(cl->un.leaf.q);
978 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
979 			}
980 			cl->prio_activity = 0;
981 			cl->cmode = HTB_CAN_SEND;
982 
983 		}
984 	}
985 	qdisc_watchdog_cancel(&q->watchdog);
986 	__skb_queue_purge(&q->direct_queue);
987 	sch->q.qlen = 0;
988 	memset(q->hlevel, 0, sizeof(q->hlevel));
989 	memset(q->row_mask, 0, sizeof(q->row_mask));
990 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
991 		INIT_LIST_HEAD(q->drops + i);
992 }
993 
994 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
995 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
996 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
997 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
998 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
999 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1000 };
1001 
1002 static void htb_work_func(struct work_struct *work)
1003 {
1004 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1005 	struct Qdisc *sch = q->watchdog.qdisc;
1006 
1007 	__netif_schedule(qdisc_root(sch));
1008 }
1009 
1010 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1011 {
1012 	struct htb_sched *q = qdisc_priv(sch);
1013 	struct nlattr *tb[TCA_HTB_MAX + 1];
1014 	struct tc_htb_glob *gopt;
1015 	int err;
1016 	int i;
1017 
1018 	if (!opt)
1019 		return -EINVAL;
1020 
1021 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1022 	if (err < 0)
1023 		return err;
1024 
1025 	if (!tb[TCA_HTB_INIT])
1026 		return -EINVAL;
1027 
1028 	gopt = nla_data(tb[TCA_HTB_INIT]);
1029 	if (gopt->version != HTB_VER >> 16)
1030 		return -EINVAL;
1031 
1032 	err = qdisc_class_hash_init(&q->clhash);
1033 	if (err < 0)
1034 		return err;
1035 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1036 		INIT_LIST_HEAD(q->drops + i);
1037 
1038 	qdisc_watchdog_init(&q->watchdog, sch);
1039 	INIT_WORK(&q->work, htb_work_func);
1040 	skb_queue_head_init(&q->direct_queue);
1041 
1042 	if (tb[TCA_HTB_DIRECT_QLEN])
1043 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1044 	else {
1045 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1046 		if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1047 			q->direct_qlen = 2;
1048 	}
1049 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1050 		q->rate2quantum = 1;
1051 	q->defcls = gopt->defcls;
1052 
1053 	return 0;
1054 }
1055 
1056 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1057 {
1058 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1059 	struct htb_sched *q = qdisc_priv(sch);
1060 	struct nlattr *nest;
1061 	struct tc_htb_glob gopt;
1062 
1063 	spin_lock_bh(root_lock);
1064 
1065 	gopt.direct_pkts = q->direct_pkts;
1066 	gopt.version = HTB_VER;
1067 	gopt.rate2quantum = q->rate2quantum;
1068 	gopt.defcls = q->defcls;
1069 	gopt.debug = 0;
1070 
1071 	nest = nla_nest_start(skb, TCA_OPTIONS);
1072 	if (nest == NULL)
1073 		goto nla_put_failure;
1074 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1075 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1076 		goto nla_put_failure;
1077 	nla_nest_end(skb, nest);
1078 
1079 	spin_unlock_bh(root_lock);
1080 	return skb->len;
1081 
1082 nla_put_failure:
1083 	spin_unlock_bh(root_lock);
1084 	nla_nest_cancel(skb, nest);
1085 	return -1;
1086 }
1087 
1088 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1089 			  struct sk_buff *skb, struct tcmsg *tcm)
1090 {
1091 	struct htb_class *cl = (struct htb_class *)arg;
1092 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1093 	struct nlattr *nest;
1094 	struct tc_htb_opt opt;
1095 
1096 	spin_lock_bh(root_lock);
1097 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1098 	tcm->tcm_handle = cl->common.classid;
1099 	if (!cl->level && cl->un.leaf.q)
1100 		tcm->tcm_info = cl->un.leaf.q->handle;
1101 
1102 	nest = nla_nest_start(skb, TCA_OPTIONS);
1103 	if (nest == NULL)
1104 		goto nla_put_failure;
1105 
1106 	memset(&opt, 0, sizeof(opt));
1107 
1108 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1109 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1110 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1111 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1112 	opt.quantum = cl->quantum;
1113 	opt.prio = cl->prio;
1114 	opt.level = cl->level;
1115 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1116 		goto nla_put_failure;
1117 
1118 	nla_nest_end(skb, nest);
1119 	spin_unlock_bh(root_lock);
1120 	return skb->len;
1121 
1122 nla_put_failure:
1123 	spin_unlock_bh(root_lock);
1124 	nla_nest_cancel(skb, nest);
1125 	return -1;
1126 }
1127 
1128 static int
1129 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1130 {
1131 	struct htb_class *cl = (struct htb_class *)arg;
1132 
1133 	if (!cl->level && cl->un.leaf.q)
1134 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1135 	cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
1136 	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
1137 
1138 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1139 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1140 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1141 		return -1;
1142 
1143 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1144 }
1145 
1146 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1147 		     struct Qdisc **old)
1148 {
1149 	struct htb_class *cl = (struct htb_class *)arg;
1150 
1151 	if (cl->level)
1152 		return -EINVAL;
1153 	if (new == NULL &&
1154 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1155 				     cl->common.classid)) == NULL)
1156 		return -ENOBUFS;
1157 
1158 	sch_tree_lock(sch);
1159 	*old = cl->un.leaf.q;
1160 	cl->un.leaf.q = new;
1161 	if (*old != NULL) {
1162 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1163 		qdisc_reset(*old);
1164 	}
1165 	sch_tree_unlock(sch);
1166 	return 0;
1167 }
1168 
1169 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1170 {
1171 	struct htb_class *cl = (struct htb_class *)arg;
1172 	return !cl->level ? cl->un.leaf.q : NULL;
1173 }
1174 
1175 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1176 {
1177 	struct htb_class *cl = (struct htb_class *)arg;
1178 
1179 	if (cl->un.leaf.q->q.qlen == 0)
1180 		htb_deactivate(qdisc_priv(sch), cl);
1181 }
1182 
1183 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1184 {
1185 	struct htb_class *cl = htb_find(classid, sch);
1186 	if (cl)
1187 		cl->refcnt++;
1188 	return (unsigned long)cl;
1189 }
1190 
1191 static inline int htb_parent_last_child(struct htb_class *cl)
1192 {
1193 	if (!cl->parent)
1194 		/* the root class */
1195 		return 0;
1196 	if (cl->parent->children > 1)
1197 		/* not the last child */
1198 		return 0;
1199 	return 1;
1200 }
1201 
1202 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1203 			       struct Qdisc *new_q)
1204 {
1205 	struct htb_class *parent = cl->parent;
1206 
1207 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1208 
1209 	if (parent->cmode != HTB_CAN_SEND)
1210 		htb_safe_rb_erase(&parent->pq_node,
1211 				  &q->hlevel[parent->level].wait_pq);
1212 
1213 	parent->level = 0;
1214 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1215 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1216 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1217 	parent->tokens = parent->buffer;
1218 	parent->ctokens = parent->cbuffer;
1219 	parent->t_c = ktime_to_ns(ktime_get());
1220 	parent->cmode = HTB_CAN_SEND;
1221 }
1222 
1223 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1224 {
1225 	if (!cl->level) {
1226 		WARN_ON(!cl->un.leaf.q);
1227 		qdisc_destroy(cl->un.leaf.q);
1228 	}
1229 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1230 	tcf_destroy_chain(&cl->filter_list);
1231 	kfree(cl);
1232 }
1233 
1234 static void htb_destroy(struct Qdisc *sch)
1235 {
1236 	struct htb_sched *q = qdisc_priv(sch);
1237 	struct hlist_node *next;
1238 	struct htb_class *cl;
1239 	unsigned int i;
1240 
1241 	cancel_work_sync(&q->work);
1242 	qdisc_watchdog_cancel(&q->watchdog);
1243 	/* This line used to be after htb_destroy_class call below
1244 	 * and surprisingly it worked in 2.4. But it must precede it
1245 	 * because filters need their target class alive to be able to call
1246 	 * unbind_filter on it (without an Oops).
1247 	 */
1248 	tcf_destroy_chain(&q->filter_list);
1249 
1250 	for (i = 0; i < q->clhash.hashsize; i++) {
1251 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
1252 			tcf_destroy_chain(&cl->filter_list);
1253 	}
1254 	for (i = 0; i < q->clhash.hashsize; i++) {
1255 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1256 					  common.hnode)
1257 			htb_destroy_class(sch, cl);
1258 	}
1259 	qdisc_class_hash_destroy(&q->clhash);
1260 	__skb_queue_purge(&q->direct_queue);
1261 }
1262 
1263 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1264 {
1265 	struct htb_sched *q = qdisc_priv(sch);
1266 	struct htb_class *cl = (struct htb_class *)arg;
1267 	unsigned int qlen;
1268 	struct Qdisc *new_q = NULL;
1269 	int last_child = 0;
1270 
1271 	// TODO: why don't we allow deleting a subtree? references? does the
1272 	// tc subsystem guarantee us that in htb_destroy it holds no class
1273 	// refs so that we can remove children safely there?
1274 	if (cl->children || cl->filter_cnt)
1275 		return -EBUSY;
1276 
1277 	if (!cl->level && htb_parent_last_child(cl)) {
1278 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1279 					  cl->parent->common.classid);
1280 		last_child = 1;
1281 	}
1282 
1283 	sch_tree_lock(sch);
1284 
1285 	if (!cl->level) {
1286 		qlen = cl->un.leaf.q->q.qlen;
1287 		qdisc_reset(cl->un.leaf.q);
1288 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1289 	}
1290 
1291 	/* delete from hash and active; remainder in destroy_class */
1292 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1293 	if (cl->parent)
1294 		cl->parent->children--;
1295 
1296 	if (cl->prio_activity)
1297 		htb_deactivate(q, cl);
1298 
1299 	if (cl->cmode != HTB_CAN_SEND)
1300 		htb_safe_rb_erase(&cl->pq_node,
1301 				  &q->hlevel[cl->level].wait_pq);
1302 
1303 	if (last_child)
1304 		htb_parent_to_leaf(q, cl, new_q);
1305 
1306 	BUG_ON(--cl->refcnt == 0);
1307 	/*
1308 	 * This shouldn't happen: we "hold" one cops->get() when called
1309 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1310 	 */
1311 
1312 	sch_tree_unlock(sch);
1313 	return 0;
1314 }
1315 
1316 static void htb_put(struct Qdisc *sch, unsigned long arg)
1317 {
1318 	struct htb_class *cl = (struct htb_class *)arg;
1319 
1320 	if (--cl->refcnt == 0)
1321 		htb_destroy_class(sch, cl);
1322 }
1323 
1324 static int htb_change_class(struct Qdisc *sch, u32 classid,
1325 			    u32 parentid, struct nlattr **tca,
1326 			    unsigned long *arg)
1327 {
1328 	int err = -EINVAL;
1329 	struct htb_sched *q = qdisc_priv(sch);
1330 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1331 	struct nlattr *opt = tca[TCA_OPTIONS];
1332 	struct nlattr *tb[TCA_HTB_MAX + 1];
1333 	struct tc_htb_opt *hopt;
1334 
1335 	/* extract all subattrs from opt attr */
1336 	if (!opt)
1337 		goto failure;
1338 
1339 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1340 	if (err < 0)
1341 		goto failure;
1342 
1343 	err = -EINVAL;
1344 	if (tb[TCA_HTB_PARMS] == NULL)
1345 		goto failure;
1346 
1347 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1348 
1349 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1350 	if (!hopt->rate.rate || !hopt->ceil.rate)
1351 		goto failure;
1352 
1353 	if (!cl) {		/* new class */
1354 		struct Qdisc *new_q;
1355 		int prio;
1356 		struct {
1357 			struct nlattr		nla;
1358 			struct gnet_estimator	opt;
1359 		} est = {
1360 			.nla = {
1361 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1362 				.nla_type	= TCA_RATE,
1363 			},
1364 			.opt = {
1365 				/* 4s interval, 16s averaging constant */
1366 				.interval	= 2,
1367 				.ewma_log	= 2,
1368 			},
1369 		};
1370 
1371 		/* check for valid classid */
1372 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1373 		    htb_find(classid, sch))
1374 			goto failure;
1375 
1376 		/* check maximal depth */
1377 		if (parent && parent->parent && parent->parent->level < 2) {
1378 			pr_err("htb: tree is too deep\n");
1379 			goto failure;
1380 		}
1381 		err = -ENOBUFS;
1382 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1383 		if (!cl)
1384 			goto failure;
1385 
1386 		if (htb_rate_est || tca[TCA_RATE]) {
1387 			err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1388 						qdisc_root_sleeping_lock(sch),
1389 						tca[TCA_RATE] ? : &est.nla);
1390 			if (err) {
1391 				kfree(cl);
1392 				goto failure;
1393 			}
1394 		}
1395 
1396 		cl->refcnt = 1;
1397 		cl->children = 0;
1398 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1399 		RB_CLEAR_NODE(&cl->pq_node);
1400 
1401 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1402 			RB_CLEAR_NODE(&cl->node[prio]);
1403 
1404 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1405 		 * so it can't be used inside of sch_tree_lock
1406 		 * -- thanks to Karlis Peisenieks
1407 		 */
1408 		new_q = qdisc_create_dflt(sch->dev_queue,
1409 					  &pfifo_qdisc_ops, classid);
1410 		sch_tree_lock(sch);
1411 		if (parent && !parent->level) {
1412 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1413 
1414 			/* turn parent into inner node */
1415 			qdisc_reset(parent->un.leaf.q);
1416 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1417 			qdisc_destroy(parent->un.leaf.q);
1418 			if (parent->prio_activity)
1419 				htb_deactivate(q, parent);
1420 
1421 			/* remove from evt list because of level change */
1422 			if (parent->cmode != HTB_CAN_SEND) {
1423 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1424 				parent->cmode = HTB_CAN_SEND;
1425 			}
1426 			parent->level = (parent->parent ? parent->parent->level
1427 					 : TC_HTB_MAXDEPTH) - 1;
1428 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1429 		}
1430 		/* leaf (we) needs elementary qdisc */
1431 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1432 
1433 		cl->common.classid = classid;
1434 		cl->parent = parent;
1435 
1436 		/* set class to be in HTB_CAN_SEND state */
1437 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1438 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1439 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1440 		cl->t_c = ktime_to_ns(ktime_get());
1441 		cl->cmode = HTB_CAN_SEND;
1442 
1443 		/* attach to the hash list and parent's family */
1444 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1445 		if (parent)
1446 			parent->children++;
1447 	} else {
1448 		if (tca[TCA_RATE]) {
1449 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1450 						    qdisc_root_sleeping_lock(sch),
1451 						    tca[TCA_RATE]);
1452 			if (err)
1453 				return err;
1454 		}
1455 		sch_tree_lock(sch);
1456 	}
1457 
1458 	/* there used to be a nasty bug here: we have to check that the node
1459 	 * is really a leaf before changing cl->un.leaf!
1460 	 */
1461 	if (!cl->level) {
1462 		cl->quantum = hopt->rate.rate / q->rate2quantum;
1463 		if (!hopt->quantum && cl->quantum < 1000) {
1464 			pr_warning(
1465 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1466 			       cl->common.classid);
1467 			cl->quantum = 1000;
1468 		}
1469 		if (!hopt->quantum && cl->quantum > 200000) {
1470 			pr_warning(
1471 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1472 			       cl->common.classid);
1473 			cl->quantum = 200000;
1474 		}
1475 		if (hopt->quantum)
1476 			cl->quantum = hopt->quantum;
1477 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1478 			cl->prio = TC_HTB_NUMPRIO - 1;
1479 	}
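	/* Example of the quantum computation above (assumed figures): with the
	 * default r2q of 10, a rate of 1 Mbit/s (125000 bytes/s) yields a
	 * quantum of 12500 bytes, well inside the [1000, 200000] sanity window
	 * enforced when no explicit quantum was supplied.
	 */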
1480 
1481 	psched_ratecfg_precompute(&cl->rate, &hopt->rate);
1482 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil);
1483 
1484 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1485 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1486 
1487 	sch_tree_unlock(sch);
1488 
1489 	qdisc_class_hash_grow(sch, &q->clhash);
1490 
1491 	*arg = (unsigned long)cl;
1492 	return 0;
1493 
1494 failure:
1495 	return err;
1496 }
1497 
1498 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1499 {
1500 	struct htb_sched *q = qdisc_priv(sch);
1501 	struct htb_class *cl = (struct htb_class *)arg;
1502 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1503 
1504 	return fl;
1505 }
1506 
1507 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1508 				     u32 classid)
1509 {
1510 	struct htb_class *cl = htb_find(classid, sch);
1511 
1512 	/*if (cl && !cl->level) return 0;
1513 	 * The line above used to be there to prevent attaching filters to
1514 	 * leaves. But at least the tc_index filter uses this just to get the class
1515 	 * for other reasons, so we have to allow it.
1516 	 * ----
1517 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1518 	 * another way to "lock" the class - unlike "get" this lock can
1519 	 * be broken by class during destroy IIUC.
1520 	 */
1521 	if (cl)
1522 		cl->filter_cnt++;
1523 	return (unsigned long)cl;
1524 }
1525 
1526 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1527 {
1528 	struct htb_class *cl = (struct htb_class *)arg;
1529 
1530 	if (cl)
1531 		cl->filter_cnt--;
1532 }
1533 
1534 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1535 {
1536 	struct htb_sched *q = qdisc_priv(sch);
1537 	struct htb_class *cl;
1538 	unsigned int i;
1539 
1540 	if (arg->stop)
1541 		return;
1542 
1543 	for (i = 0; i < q->clhash.hashsize; i++) {
1544 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1545 			if (arg->count < arg->skip) {
1546 				arg->count++;
1547 				continue;
1548 			}
1549 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1550 				arg->stop = 1;
1551 				return;
1552 			}
1553 			arg->count++;
1554 		}
1555 	}
1556 }
1557 
1558 static const struct Qdisc_class_ops htb_class_ops = {
1559 	.graft		=	htb_graft,
1560 	.leaf		=	htb_leaf,
1561 	.qlen_notify	=	htb_qlen_notify,
1562 	.get		=	htb_get,
1563 	.put		=	htb_put,
1564 	.change		=	htb_change_class,
1565 	.delete		=	htb_delete,
1566 	.walk		=	htb_walk,
1567 	.tcf_chain	=	htb_find_tcf,
1568 	.bind_tcf	=	htb_bind_filter,
1569 	.unbind_tcf	=	htb_unbind_filter,
1570 	.dump		=	htb_dump_class,
1571 	.dump_stats	=	htb_dump_class_stats,
1572 };
1573 
1574 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1575 	.cl_ops		=	&htb_class_ops,
1576 	.id		=	"htb",
1577 	.priv_size	=	sizeof(struct htb_sched),
1578 	.enqueue	=	htb_enqueue,
1579 	.dequeue	=	htb_dequeue,
1580 	.peek		=	qdisc_peek_dequeued,
1581 	.drop		=	htb_drop,
1582 	.init		=	htb_init,
1583 	.reset		=	htb_reset,
1584 	.destroy	=	htb_destroy,
1585 	.dump		=	htb_dump,
1586 	.owner		=	THIS_MODULE,
1587 };
1588 
1589 static int __init htb_module_init(void)
1590 {
1591 	return register_qdisc(&htb_qdisc_ops);
1592 }
1593 static void __exit htb_module_exit(void)
1594 {
1595 	unregister_qdisc(&htb_qdisc_ops);
1596 }
1597 
1598 module_init(htb_module_init)
1599 module_exit(htb_module_exit)
1600 MODULE_LICENSE("GPL");
1601