xref: /openbmc/linux/net/sched/sch_htb.c (revision f42b3800)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  *
28  * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $
29  */
30 #include <linux/module.h>
31 #include <linux/types.h>
32 #include <linux/kernel.h>
33 #include <linux/string.h>
34 #include <linux/errno.h>
35 #include <linux/skbuff.h>
36 #include <linux/list.h>
37 #include <linux/compiler.h>
38 #include <linux/rbtree.h>
39 #include <net/netlink.h>
40 #include <net/pkt_sched.h>
41 
42 /* HTB algorithm.
43     Author: devik@cdi.cz
44     ========================================================================
45     HTB is like TBF with multiple classes. It is also similar to CBQ because
46     it allows to assign priority to each class in hierarchy.
47     In fact it is another implementation of Floyd's formal sharing.
48 
49     Levels:
50     Each class is assigned level. Leaf has ALWAYS level 0 and root
51     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
52     one less than their parent.
53 */
54 
55 #define HTB_HSIZE 16		/* classid hash size */
56 #define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
57 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
58 
59 #if HTB_VER >> 16 != TC_HTB_PROTOVER
60 #error "Mismatched sch_htb.c and pkt_sch.h"
61 #endif
62 
63 /* used internaly to keep status of single class */
64 enum htb_cmode {
65 	HTB_CANT_SEND,		/* class can't send and can't borrow */
66 	HTB_MAY_BORROW,		/* class can't send but may borrow */
67 	HTB_CAN_SEND		/* class can send */
68 };
69 
70 /* interior & leaf nodes; props specific to leaves are marked L: */
71 struct htb_class {
72 	/* general class parameters */
73 	u32 classid;
74 	struct gnet_stats_basic bstats;
75 	struct gnet_stats_queue qstats;
76 	struct gnet_stats_rate_est rate_est;
77 	struct tc_htb_xstats xstats;	/* our special stats */
78 	int refcnt;		/* usage count of this class */
79 
80 	/* topology */
81 	int level;		/* our level (see above) */
82 	struct htb_class *parent;	/* parent class */
83 	struct hlist_node hlist;	/* classid hash list item */
84 	struct list_head sibling;	/* sibling list item */
85 	struct list_head children;	/* children list */
86 
87 	union {
88 		struct htb_class_leaf {
89 			struct Qdisc *q;
90 			int prio;
91 			int aprio;
92 			int quantum;
93 			int deficit[TC_HTB_MAXDEPTH];
94 			struct list_head drop_list;
95 		} leaf;
96 		struct htb_class_inner {
97 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
98 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
99 			/* When class changes from state 1->2 and disconnects from
100 			   parent's feed then we lost ptr value and start from the
101 			   first child again. Here we store classid of the
102 			   last valid ptr (used when ptr is NULL). */
103 			u32 last_ptr_id[TC_HTB_NUMPRIO];
104 		} inner;
105 	} un;
106 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
107 	struct rb_node pq_node;	/* node for event queue */
108 	psched_time_t pq_key;
109 
110 	int prio_activity;	/* for which prios are we active */
111 	enum htb_cmode cmode;	/* current mode of the class */
112 
113 	/* class attached filters */
114 	struct tcf_proto *filter_list;
115 	int filter_cnt;
116 
117 	int warned;		/* only one warning about non work conserving .. */
118 
119 	/* token bucket parameters */
120 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
121 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
122 	long buffer, cbuffer;	/* token bucket depth/rate */
123 	psched_tdiff_t mbuffer;	/* max wait time */
124 	long tokens, ctokens;	/* current number of tokens */
125 	psched_time_t t_c;	/* checkpoint time */
126 
127 	int prio;		/* For parent to leaf return possible here */
128 	int quantum;		/* we do backup. Finally full replacement  */
129 				/* of un.leaf originals should be done. */
130 };
131 
132 static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
133 			   int size)
134 {
135 	long result = qdisc_l2t(rate, size);
136 	return result;
137 }
138 
139 struct htb_sched {
140 	struct list_head root;	/* root classes list */
141 	struct hlist_head hash[HTB_HSIZE];	/* hashed by classid */
142 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
143 
144 	/* self list - roots of self generating tree */
145 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
146 	int row_mask[TC_HTB_MAXDEPTH];
147 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
148 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
149 
150 	/* self wait list - roots of wait PQs per row */
151 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
152 
153 	/* time of nearest event per level (row) */
154 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
155 
156 	/* whether we hit non-work conserving class during this dequeue; we use */
157 	int nwc_hit;		/* this to disable mindelay complaint in dequeue */
158 
159 	int defcls;		/* class where unclassified flows go to */
160 
161 	/* filters for qdisc itself */
162 	struct tcf_proto *filter_list;
163 	int filter_cnt;
164 
165 	int rate2quantum;	/* quant = rate / rate2quantum */
166 	psched_time_t now;	/* cached dequeue time */
167 	struct qdisc_watchdog watchdog;
168 
169 	/* non shaped skbs; let them go directly thru */
170 	struct sk_buff_head direct_queue;
171 	int direct_qlen;	/* max qlen of above */
172 
173 	long direct_pkts;
174 };
175 
176 /* compute hash of size HTB_HSIZE for given handle */
177 static inline int htb_hash(u32 h)
178 {
179 #if HTB_HSIZE != 16
180 #error "Declare new hash for your HTB_HSIZE"
181 #endif
182 	h ^= h >> 8;		/* stolen from cbq_hash */
183 	h ^= h >> 4;
184 	return h & 0xf;
185 }
186 
187 /* find class in global hash table using given handle */
188 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
189 {
190 	struct htb_sched *q = qdisc_priv(sch);
191 	struct hlist_node *p;
192 	struct htb_class *cl;
193 
194 	if (TC_H_MAJ(handle) != sch->handle)
195 		return NULL;
196 
197 	hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
198 		if (cl->classid == handle)
199 			return cl;
200 	}
201 	return NULL;
202 }
203 
204 /**
205  * htb_classify - classify a packet into class
206  *
207  * It returns NULL if the packet should be dropped or -1 if the packet
208  * should be passed directly thru. In all other cases leaf class is returned.
209  * We allow direct class selection by classid in priority. The we examine
210  * filters in qdisc and in inner nodes (if higher filter points to the inner
211  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
212  * internal fifo (direct). These packets then go directly thru. If we still
213  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
214  * then finish and return direct queue.
215  */
216 #define HTB_DIRECT (struct htb_class*)-1
217 
218 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
219 				      int *qerr)
220 {
221 	struct htb_sched *q = qdisc_priv(sch);
222 	struct htb_class *cl;
223 	struct tcf_result res;
224 	struct tcf_proto *tcf;
225 	int result;
226 
227 	/* allow to select class by setting skb->priority to valid classid;
228 	   note that nfmark can be used too by attaching filter fw with no
229 	   rules in it */
230 	if (skb->priority == sch->handle)
231 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
232 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
233 		return cl;
234 
235 	*qerr = NET_XMIT_BYPASS;
236 	tcf = q->filter_list;
237 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
238 #ifdef CONFIG_NET_CLS_ACT
239 		switch (result) {
240 		case TC_ACT_QUEUED:
241 		case TC_ACT_STOLEN:
242 			*qerr = NET_XMIT_SUCCESS;
243 		case TC_ACT_SHOT:
244 			return NULL;
245 		}
246 #endif
247 		if ((cl = (void *)res.class) == NULL) {
248 			if (res.classid == sch->handle)
249 				return HTB_DIRECT;	/* X:0 (direct flow) */
250 			if ((cl = htb_find(res.classid, sch)) == NULL)
251 				break;	/* filter selected invalid classid */
252 		}
253 		if (!cl->level)
254 			return cl;	/* we hit leaf; return it */
255 
256 		/* we have got inner class; apply inner filter chain */
257 		tcf = cl->filter_list;
258 	}
259 	/* classification failed; try to use default class */
260 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
261 	if (!cl || cl->level)
262 		return HTB_DIRECT;	/* bad default .. this is safe bet */
263 	return cl;
264 }
265 
266 /**
267  * htb_add_to_id_tree - adds class to the round robin list
268  *
269  * Routine adds class to the list (actually tree) sorted by classid.
270  * Make sure that class is not already on such list for given prio.
271  */
272 static void htb_add_to_id_tree(struct rb_root *root,
273 			       struct htb_class *cl, int prio)
274 {
275 	struct rb_node **p = &root->rb_node, *parent = NULL;
276 
277 	while (*p) {
278 		struct htb_class *c;
279 		parent = *p;
280 		c = rb_entry(parent, struct htb_class, node[prio]);
281 
282 		if (cl->classid > c->classid)
283 			p = &parent->rb_right;
284 		else
285 			p = &parent->rb_left;
286 	}
287 	rb_link_node(&cl->node[prio], parent, p);
288 	rb_insert_color(&cl->node[prio], root);
289 }
290 
291 /**
292  * htb_add_to_wait_tree - adds class to the event queue with delay
293  *
294  * The class is added to priority event queue to indicate that class will
295  * change its mode in cl->pq_key microseconds. Make sure that class is not
296  * already in the queue.
297  */
298 static void htb_add_to_wait_tree(struct htb_sched *q,
299 				 struct htb_class *cl, long delay)
300 {
301 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
302 
303 	cl->pq_key = q->now + delay;
304 	if (cl->pq_key == q->now)
305 		cl->pq_key++;
306 
307 	/* update the nearest event cache */
308 	if (q->near_ev_cache[cl->level] > cl->pq_key)
309 		q->near_ev_cache[cl->level] = cl->pq_key;
310 
311 	while (*p) {
312 		struct htb_class *c;
313 		parent = *p;
314 		c = rb_entry(parent, struct htb_class, pq_node);
315 		if (cl->pq_key >= c->pq_key)
316 			p = &parent->rb_right;
317 		else
318 			p = &parent->rb_left;
319 	}
320 	rb_link_node(&cl->pq_node, parent, p);
321 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
322 }
323 
324 /**
325  * htb_next_rb_node - finds next node in binary tree
326  *
327  * When we are past last key we return NULL.
328  * Average complexity is 2 steps per call.
329  */
330 static inline void htb_next_rb_node(struct rb_node **n)
331 {
332 	*n = rb_next(*n);
333 }
334 
335 /**
336  * htb_add_class_to_row - add class to its row
337  *
338  * The class is added to row at priorities marked in mask.
339  * It does nothing if mask == 0.
340  */
341 static inline void htb_add_class_to_row(struct htb_sched *q,
342 					struct htb_class *cl, int mask)
343 {
344 	q->row_mask[cl->level] |= mask;
345 	while (mask) {
346 		int prio = ffz(~mask);
347 		mask &= ~(1 << prio);
348 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
349 	}
350 }
351 
352 /* If this triggers, it is a bug in this code, but it need not be fatal */
353 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
354 {
355 	if (RB_EMPTY_NODE(rb)) {
356 		WARN_ON(1);
357 	} else {
358 		rb_erase(rb, root);
359 		RB_CLEAR_NODE(rb);
360 	}
361 }
362 
363 
364 /**
365  * htb_remove_class_from_row - removes class from its row
366  *
367  * The class is removed from row at priorities marked in mask.
368  * It does nothing if mask == 0.
369  */
370 static inline void htb_remove_class_from_row(struct htb_sched *q,
371 						 struct htb_class *cl, int mask)
372 {
373 	int m = 0;
374 
375 	while (mask) {
376 		int prio = ffz(~mask);
377 
378 		mask &= ~(1 << prio);
379 		if (q->ptr[cl->level][prio] == cl->node + prio)
380 			htb_next_rb_node(q->ptr[cl->level] + prio);
381 
382 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
383 		if (!q->row[cl->level][prio].rb_node)
384 			m |= 1 << prio;
385 	}
386 	q->row_mask[cl->level] &= ~m;
387 }
388 
389 /**
390  * htb_activate_prios - creates active classe's feed chain
391  *
392  * The class is connected to ancestors and/or appropriate rows
393  * for priorities it is participating on. cl->cmode must be new
394  * (activated) mode. It does nothing if cl->prio_activity == 0.
395  */
396 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
397 {
398 	struct htb_class *p = cl->parent;
399 	long m, mask = cl->prio_activity;
400 
401 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
402 		m = mask;
403 		while (m) {
404 			int prio = ffz(~m);
405 			m &= ~(1 << prio);
406 
407 			if (p->un.inner.feed[prio].rb_node)
408 				/* parent already has its feed in use so that
409 				   reset bit in mask as parent is already ok */
410 				mask &= ~(1 << prio);
411 
412 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
413 		}
414 		p->prio_activity |= mask;
415 		cl = p;
416 		p = cl->parent;
417 
418 	}
419 	if (cl->cmode == HTB_CAN_SEND && mask)
420 		htb_add_class_to_row(q, cl, mask);
421 }
422 
423 /**
424  * htb_deactivate_prios - remove class from feed chain
425  *
426  * cl->cmode must represent old mode (before deactivation). It does
427  * nothing if cl->prio_activity == 0. Class is removed from all feed
428  * chains and rows.
429  */
430 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
431 {
432 	struct htb_class *p = cl->parent;
433 	long m, mask = cl->prio_activity;
434 
435 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
436 		m = mask;
437 		mask = 0;
438 		while (m) {
439 			int prio = ffz(~m);
440 			m &= ~(1 << prio);
441 
442 			if (p->un.inner.ptr[prio] == cl->node + prio) {
443 				/* we are removing child which is pointed to from
444 				   parent feed - forget the pointer but remember
445 				   classid */
446 				p->un.inner.last_ptr_id[prio] = cl->classid;
447 				p->un.inner.ptr[prio] = NULL;
448 			}
449 
450 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
451 
452 			if (!p->un.inner.feed[prio].rb_node)
453 				mask |= 1 << prio;
454 		}
455 
456 		p->prio_activity &= ~mask;
457 		cl = p;
458 		p = cl->parent;
459 
460 	}
461 	if (cl->cmode == HTB_CAN_SEND && mask)
462 		htb_remove_class_from_row(q, cl, mask);
463 }
464 
465 #if HTB_HYSTERESIS
466 static inline long htb_lowater(const struct htb_class *cl)
467 {
468 	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
469 }
470 static inline long htb_hiwater(const struct htb_class *cl)
471 {
472 	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
473 }
474 #else
475 #define htb_lowater(cl)	(0)
476 #define htb_hiwater(cl)	(0)
477 #endif
478 
479 /**
480  * htb_class_mode - computes and returns current class mode
481  *
482  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
483  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
484  * from now to time when cl will change its state.
485  * Also it is worth to note that class mode doesn't change simply
486  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
487  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
488  * mode transitions per time unit. The speed gain is about 1/6.
489  */
490 static inline enum htb_cmode
491 htb_class_mode(struct htb_class *cl, long *diff)
492 {
493 	long toks;
494 
495 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
496 		*diff = -toks;
497 		return HTB_CANT_SEND;
498 	}
499 
500 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
501 		return HTB_CAN_SEND;
502 
503 	*diff = -toks;
504 	return HTB_MAY_BORROW;
505 }
506 
507 /**
508  * htb_change_class_mode - changes classe's mode
509  *
510  * This should be the only way how to change classe's mode under normal
511  * cirsumstances. Routine will update feed lists linkage, change mode
512  * and add class to the wait event queue if appropriate. New mode should
513  * be different from old one and cl->pq_key has to be valid if changing
514  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
515  */
516 static void
517 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
518 {
519 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
520 
521 	if (new_mode == cl->cmode)
522 		return;
523 
524 	if (cl->prio_activity) {	/* not necessary: speed optimization */
525 		if (cl->cmode != HTB_CANT_SEND)
526 			htb_deactivate_prios(q, cl);
527 		cl->cmode = new_mode;
528 		if (new_mode != HTB_CANT_SEND)
529 			htb_activate_prios(q, cl);
530 	} else
531 		cl->cmode = new_mode;
532 }
533 
534 /**
535  * htb_activate - inserts leaf cl into appropriate active feeds
536  *
537  * Routine learns (new) priority of leaf and activates feed chain
538  * for the prio. It can be called on already active leaf safely.
539  * It also adds leaf into droplist.
540  */
541 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
542 {
543 	BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);
544 
545 	if (!cl->prio_activity) {
546 		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
547 		htb_activate_prios(q, cl);
548 		list_add_tail(&cl->un.leaf.drop_list,
549 			      q->drops + cl->un.leaf.aprio);
550 	}
551 }
552 
553 /**
554  * htb_deactivate - remove leaf cl from active feeds
555  *
556  * Make sure that leaf is active. In the other words it can't be called
557  * with non-active leaf. It also removes class from the drop list.
558  */
559 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
560 {
561 	BUG_TRAP(cl->prio_activity);
562 
563 	htb_deactivate_prios(q, cl);
564 	cl->prio_activity = 0;
565 	list_del_init(&cl->un.leaf.drop_list);
566 }
567 
568 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
569 {
570 	int ret;
571 	struct htb_sched *q = qdisc_priv(sch);
572 	struct htb_class *cl = htb_classify(skb, sch, &ret);
573 
574 	if (cl == HTB_DIRECT) {
575 		/* enqueue to helper queue */
576 		if (q->direct_queue.qlen < q->direct_qlen) {
577 			__skb_queue_tail(&q->direct_queue, skb);
578 			q->direct_pkts++;
579 		} else {
580 			kfree_skb(skb);
581 			sch->qstats.drops++;
582 			return NET_XMIT_DROP;
583 		}
584 #ifdef CONFIG_NET_CLS_ACT
585 	} else if (!cl) {
586 		if (ret == NET_XMIT_BYPASS)
587 			sch->qstats.drops++;
588 		kfree_skb(skb);
589 		return ret;
590 #endif
591 	} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) !=
592 		   NET_XMIT_SUCCESS) {
593 		sch->qstats.drops++;
594 		cl->qstats.drops++;
595 		return NET_XMIT_DROP;
596 	} else {
597 		cl->bstats.packets +=
598 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
599 		cl->bstats.bytes += skb->len;
600 		htb_activate(q, cl);
601 	}
602 
603 	sch->q.qlen++;
604 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
605 	sch->bstats.bytes += skb->len;
606 	return NET_XMIT_SUCCESS;
607 }
608 
609 /* TODO: requeuing packet charges it to policers again !! */
610 static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
611 {
612 	int ret;
613 	struct htb_sched *q = qdisc_priv(sch);
614 	struct htb_class *cl = htb_classify(skb, sch, &ret);
615 	struct sk_buff *tskb;
616 
617 	if (cl == HTB_DIRECT) {
618 		/* enqueue to helper queue */
619 		if (q->direct_queue.qlen < q->direct_qlen) {
620 			__skb_queue_head(&q->direct_queue, skb);
621 		} else {
622 			__skb_queue_head(&q->direct_queue, skb);
623 			tskb = __skb_dequeue_tail(&q->direct_queue);
624 			kfree_skb(tskb);
625 			sch->qstats.drops++;
626 			return NET_XMIT_CN;
627 		}
628 #ifdef CONFIG_NET_CLS_ACT
629 	} else if (!cl) {
630 		if (ret == NET_XMIT_BYPASS)
631 			sch->qstats.drops++;
632 		kfree_skb(skb);
633 		return ret;
634 #endif
635 	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
636 		   NET_XMIT_SUCCESS) {
637 		sch->qstats.drops++;
638 		cl->qstats.drops++;
639 		return NET_XMIT_DROP;
640 	} else
641 		htb_activate(q, cl);
642 
643 	sch->q.qlen++;
644 	sch->qstats.requeues++;
645 	return NET_XMIT_SUCCESS;
646 }
647 
648 /**
649  * htb_charge_class - charges amount "bytes" to leaf and ancestors
650  *
651  * Routine assumes that packet "bytes" long was dequeued from leaf cl
652  * borrowing from "level". It accounts bytes to ceil leaky bucket for
653  * leaf and all ancestors and to rate bucket for ancestors at levels
654  * "level" and higher. It also handles possible change of mode resulting
655  * from the update. Note that mode can also increase here (MAY_BORROW to
656  * CAN_SEND) because we can use more precise clock that event queue here.
657  * In such case we remove class from event queue first.
658  */
659 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
660 			     int level, struct sk_buff *skb)
661 {
662 	int bytes = skb->len;
663 	long toks, diff;
664 	enum htb_cmode old_mode;
665 
666 #define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
667 	if (toks > cl->B) toks = cl->B; \
668 	toks -= L2T(cl, cl->R, bytes); \
669 	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
670 	cl->T = toks
671 
672 	while (cl) {
673 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
674 		if (cl->level >= level) {
675 			if (cl->level == level)
676 				cl->xstats.lends++;
677 			HTB_ACCNT(tokens, buffer, rate);
678 		} else {
679 			cl->xstats.borrows++;
680 			cl->tokens += diff;	/* we moved t_c; update tokens */
681 		}
682 		HTB_ACCNT(ctokens, cbuffer, ceil);
683 		cl->t_c = q->now;
684 
685 		old_mode = cl->cmode;
686 		diff = 0;
687 		htb_change_class_mode(q, cl, &diff);
688 		if (old_mode != cl->cmode) {
689 			if (old_mode != HTB_CAN_SEND)
690 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
691 			if (cl->cmode != HTB_CAN_SEND)
692 				htb_add_to_wait_tree(q, cl, diff);
693 		}
694 
695 		/* update byte stats except for leaves which are already updated */
696 		if (cl->level) {
697 			cl->bstats.bytes += bytes;
698 			cl->bstats.packets += skb_is_gso(skb)?
699 					skb_shinfo(skb)->gso_segs:1;
700 		}
701 		cl = cl->parent;
702 	}
703 }
704 
705 /**
706  * htb_do_events - make mode changes to classes at the level
707  *
708  * Scans event queue for pending events and applies them. Returns time of
709  * next pending event (0 for no event in pq).
710  * Note: Applied are events whose have cl->pq_key <= q->now.
711  */
712 static psched_time_t htb_do_events(struct htb_sched *q, int level)
713 {
714 	/* don't run for longer than 2 jiffies; 2 is used instead of
715 	   1 to simplify things when jiffy is going to be incremented
716 	   too soon */
717 	unsigned long stop_at = jiffies + 2;
718 	while (time_before(jiffies, stop_at)) {
719 		struct htb_class *cl;
720 		long diff;
721 		struct rb_node *p = rb_first(&q->wait_pq[level]);
722 
723 		if (!p)
724 			return 0;
725 
726 		cl = rb_entry(p, struct htb_class, pq_node);
727 		if (cl->pq_key > q->now)
728 			return cl->pq_key;
729 
730 		htb_safe_rb_erase(p, q->wait_pq + level);
731 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
732 		htb_change_class_mode(q, cl, &diff);
733 		if (cl->cmode != HTB_CAN_SEND)
734 			htb_add_to_wait_tree(q, cl, diff);
735 	}
736 	/* too much load - let's continue on next jiffie */
737 	return q->now + PSCHED_TICKS_PER_SEC / HZ;
738 }
739 
740 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
741    is no such one exists. */
742 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
743 					      u32 id)
744 {
745 	struct rb_node *r = NULL;
746 	while (n) {
747 		struct htb_class *cl =
748 		    rb_entry(n, struct htb_class, node[prio]);
749 		if (id == cl->classid)
750 			return n;
751 
752 		if (id > cl->classid) {
753 			n = n->rb_right;
754 		} else {
755 			r = n;
756 			n = n->rb_left;
757 		}
758 	}
759 	return r;
760 }
761 
762 /**
763  * htb_lookup_leaf - returns next leaf class in DRR order
764  *
765  * Find leaf where current feed pointers points to.
766  */
767 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
768 					 struct rb_node **pptr, u32 * pid)
769 {
770 	int i;
771 	struct {
772 		struct rb_node *root;
773 		struct rb_node **pptr;
774 		u32 *pid;
775 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
776 
777 	BUG_TRAP(tree->rb_node);
778 	sp->root = tree->rb_node;
779 	sp->pptr = pptr;
780 	sp->pid = pid;
781 
782 	for (i = 0; i < 65535; i++) {
783 		if (!*sp->pptr && *sp->pid) {
784 			/* ptr was invalidated but id is valid - try to recover
785 			   the original or next ptr */
786 			*sp->pptr =
787 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
788 		}
789 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
790 				   can become out of date quickly */
791 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
792 			*sp->pptr = sp->root;
793 			while ((*sp->pptr)->rb_left)
794 				*sp->pptr = (*sp->pptr)->rb_left;
795 			if (sp > stk) {
796 				sp--;
797 				BUG_TRAP(*sp->pptr);
798 				if (!*sp->pptr)
799 					return NULL;
800 				htb_next_rb_node(sp->pptr);
801 			}
802 		} else {
803 			struct htb_class *cl;
804 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
805 			if (!cl->level)
806 				return cl;
807 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
808 			sp->pptr = cl->un.inner.ptr + prio;
809 			sp->pid = cl->un.inner.last_ptr_id + prio;
810 		}
811 	}
812 	BUG_TRAP(0);
813 	return NULL;
814 }
815 
816 /* dequeues packet at given priority and level; call only if
817    you are sure that there is active class at prio/level */
818 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
819 					int level)
820 {
821 	struct sk_buff *skb = NULL;
822 	struct htb_class *cl, *start;
823 	/* look initial class up in the row */
824 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
825 				     q->ptr[level] + prio,
826 				     q->last_ptr_id[level] + prio);
827 
828 	do {
829 next:
830 		BUG_TRAP(cl);
831 		if (!cl)
832 			return NULL;
833 
834 		/* class can be empty - it is unlikely but can be true if leaf
835 		   qdisc drops packets in enqueue routine or if someone used
836 		   graft operation on the leaf since last dequeue;
837 		   simply deactivate and skip such class */
838 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
839 			struct htb_class *next;
840 			htb_deactivate(q, cl);
841 
842 			/* row/level might become empty */
843 			if ((q->row_mask[level] & (1 << prio)) == 0)
844 				return NULL;
845 
846 			next = htb_lookup_leaf(q->row[level] + prio,
847 					       prio, q->ptr[level] + prio,
848 					       q->last_ptr_id[level] + prio);
849 
850 			if (cl == start)	/* fix start if we just deleted it */
851 				start = next;
852 			cl = next;
853 			goto next;
854 		}
855 
856 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
857 		if (likely(skb != NULL))
858 			break;
859 		if (!cl->warned) {
860 			printk(KERN_WARNING
861 			       "htb: class %X isn't work conserving ?!\n",
862 			       cl->classid);
863 			cl->warned = 1;
864 		}
865 		q->nwc_hit++;
866 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
867 				  ptr[0]) + prio);
868 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
869 				     q->ptr[level] + prio,
870 				     q->last_ptr_id[level] + prio);
871 
872 	} while (cl != start);
873 
874 	if (likely(skb != NULL)) {
875 		if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
876 			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
877 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
878 					  ptr[0]) + prio);
879 		}
880 		/* this used to be after charge_class but this constelation
881 		   gives us slightly better performance */
882 		if (!cl->un.leaf.q->q.qlen)
883 			htb_deactivate(q, cl);
884 		htb_charge_class(q, cl, level, skb);
885 	}
886 	return skb;
887 }
888 
889 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
890 {
891 	struct sk_buff *skb = NULL;
892 	struct htb_sched *q = qdisc_priv(sch);
893 	int level;
894 	psched_time_t next_event;
895 
896 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
897 	skb = __skb_dequeue(&q->direct_queue);
898 	if (skb != NULL) {
899 		sch->flags &= ~TCQ_F_THROTTLED;
900 		sch->q.qlen--;
901 		return skb;
902 	}
903 
904 	if (!sch->q.qlen)
905 		goto fin;
906 	q->now = psched_get_time();
907 
908 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
909 	q->nwc_hit = 0;
910 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
911 		/* common case optimization - skip event handler quickly */
912 		int m;
913 		psched_time_t event;
914 
915 		if (q->now >= q->near_ev_cache[level]) {
916 			event = htb_do_events(q, level);
917 			if (!event)
918 				event = q->now + PSCHED_TICKS_PER_SEC;
919 			q->near_ev_cache[level] = event;
920 		} else
921 			event = q->near_ev_cache[level];
922 
923 		if (event && next_event > event)
924 			next_event = event;
925 
926 		m = ~q->row_mask[level];
927 		while (m != (int)(-1)) {
928 			int prio = ffz(m);
929 			m |= 1 << prio;
930 			skb = htb_dequeue_tree(q, prio, level);
931 			if (likely(skb != NULL)) {
932 				sch->q.qlen--;
933 				sch->flags &= ~TCQ_F_THROTTLED;
934 				goto fin;
935 			}
936 		}
937 	}
938 	sch->qstats.overlimits++;
939 	qdisc_watchdog_schedule(&q->watchdog, next_event);
940 fin:
941 	return skb;
942 }
943 
944 /* try to drop from each class (by prio) until one succeed */
945 static unsigned int htb_drop(struct Qdisc *sch)
946 {
947 	struct htb_sched *q = qdisc_priv(sch);
948 	int prio;
949 
950 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
951 		struct list_head *p;
952 		list_for_each(p, q->drops + prio) {
953 			struct htb_class *cl = list_entry(p, struct htb_class,
954 							  un.leaf.drop_list);
955 			unsigned int len;
956 			if (cl->un.leaf.q->ops->drop &&
957 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
958 				sch->q.qlen--;
959 				if (!cl->un.leaf.q->q.qlen)
960 					htb_deactivate(q, cl);
961 				return len;
962 			}
963 		}
964 	}
965 	return 0;
966 }
967 
968 /* reset all classes */
969 /* always caled under BH & queue lock */
970 static void htb_reset(struct Qdisc *sch)
971 {
972 	struct htb_sched *q = qdisc_priv(sch);
973 	int i;
974 
975 	for (i = 0; i < HTB_HSIZE; i++) {
976 		struct hlist_node *p;
977 		struct htb_class *cl;
978 
979 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
980 			if (cl->level)
981 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
982 			else {
983 				if (cl->un.leaf.q)
984 					qdisc_reset(cl->un.leaf.q);
985 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
986 			}
987 			cl->prio_activity = 0;
988 			cl->cmode = HTB_CAN_SEND;
989 
990 		}
991 	}
992 	qdisc_watchdog_cancel(&q->watchdog);
993 	__skb_queue_purge(&q->direct_queue);
994 	sch->q.qlen = 0;
995 	memset(q->row, 0, sizeof(q->row));
996 	memset(q->row_mask, 0, sizeof(q->row_mask));
997 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
998 	memset(q->ptr, 0, sizeof(q->ptr));
999 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1000 		INIT_LIST_HEAD(q->drops + i);
1001 }
1002 
1003 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1004 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1005 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1006 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1007 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1008 };
1009 
1010 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1011 {
1012 	struct htb_sched *q = qdisc_priv(sch);
1013 	struct nlattr *tb[TCA_HTB_INIT + 1];
1014 	struct tc_htb_glob *gopt;
1015 	int err;
1016 	int i;
1017 
1018 	if (!opt)
1019 		return -EINVAL;
1020 
1021 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
1022 	if (err < 0)
1023 		return err;
1024 
1025 	if (tb[TCA_HTB_INIT] == NULL) {
1026 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
1027 		return -EINVAL;
1028 	}
1029 	gopt = nla_data(tb[TCA_HTB_INIT]);
1030 	if (gopt->version != HTB_VER >> 16) {
1031 		printk(KERN_ERR
1032 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1033 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1034 		return -EINVAL;
1035 	}
1036 
1037 	INIT_LIST_HEAD(&q->root);
1038 	for (i = 0; i < HTB_HSIZE; i++)
1039 		INIT_HLIST_HEAD(q->hash + i);
1040 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1041 		INIT_LIST_HEAD(q->drops + i);
1042 
1043 	qdisc_watchdog_init(&q->watchdog, sch);
1044 	skb_queue_head_init(&q->direct_queue);
1045 
1046 	q->direct_qlen = sch->dev->tx_queue_len;
1047 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1048 		q->direct_qlen = 2;
1049 
1050 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1051 		q->rate2quantum = 1;
1052 	q->defcls = gopt->defcls;
1053 
1054 	return 0;
1055 }
1056 
1057 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1058 {
1059 	struct htb_sched *q = qdisc_priv(sch);
1060 	struct nlattr *nest;
1061 	struct tc_htb_glob gopt;
1062 
1063 	spin_lock_bh(&sch->dev->queue_lock);
1064 
1065 	gopt.direct_pkts = q->direct_pkts;
1066 	gopt.version = HTB_VER;
1067 	gopt.rate2quantum = q->rate2quantum;
1068 	gopt.defcls = q->defcls;
1069 	gopt.debug = 0;
1070 
1071 	nest = nla_nest_start(skb, TCA_OPTIONS);
1072 	if (nest == NULL)
1073 		goto nla_put_failure;
1074 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1075 	nla_nest_end(skb, nest);
1076 
1077 	spin_unlock_bh(&sch->dev->queue_lock);
1078 	return skb->len;
1079 
1080 nla_put_failure:
1081 	spin_unlock_bh(&sch->dev->queue_lock);
1082 	nla_nest_cancel(skb, nest);
1083 	return -1;
1084 }
1085 
1086 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1087 			  struct sk_buff *skb, struct tcmsg *tcm)
1088 {
1089 	struct htb_class *cl = (struct htb_class *)arg;
1090 	struct nlattr *nest;
1091 	struct tc_htb_opt opt;
1092 
1093 	spin_lock_bh(&sch->dev->queue_lock);
1094 	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
1095 	tcm->tcm_handle = cl->classid;
1096 	if (!cl->level && cl->un.leaf.q)
1097 		tcm->tcm_info = cl->un.leaf.q->handle;
1098 
1099 	nest = nla_nest_start(skb, TCA_OPTIONS);
1100 	if (nest == NULL)
1101 		goto nla_put_failure;
1102 
1103 	memset(&opt, 0, sizeof(opt));
1104 
1105 	opt.rate = cl->rate->rate;
1106 	opt.buffer = cl->buffer;
1107 	opt.ceil = cl->ceil->rate;
1108 	opt.cbuffer = cl->cbuffer;
1109 	opt.quantum = cl->un.leaf.quantum;
1110 	opt.prio = cl->un.leaf.prio;
1111 	opt.level = cl->level;
1112 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1113 
1114 	nla_nest_end(skb, nest);
1115 	spin_unlock_bh(&sch->dev->queue_lock);
1116 	return skb->len;
1117 
1118 nla_put_failure:
1119 	spin_unlock_bh(&sch->dev->queue_lock);
1120 	nla_nest_cancel(skb, nest);
1121 	return -1;
1122 }
1123 
1124 static int
1125 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1126 {
1127 	struct htb_class *cl = (struct htb_class *)arg;
1128 
1129 	if (!cl->level && cl->un.leaf.q)
1130 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1131 	cl->xstats.tokens = cl->tokens;
1132 	cl->xstats.ctokens = cl->ctokens;
1133 
1134 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1135 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1136 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1137 		return -1;
1138 
1139 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1140 }
1141 
1142 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1143 		     struct Qdisc **old)
1144 {
1145 	struct htb_class *cl = (struct htb_class *)arg;
1146 
1147 	if (cl && !cl->level) {
1148 		if (new == NULL &&
1149 		    (new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1150 					     cl->classid))
1151 		    == NULL)
1152 			return -ENOBUFS;
1153 		sch_tree_lock(sch);
1154 		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
1155 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1156 			qdisc_reset(*old);
1157 		}
1158 		sch_tree_unlock(sch);
1159 		return 0;
1160 	}
1161 	return -ENOENT;
1162 }
1163 
1164 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1165 {
1166 	struct htb_class *cl = (struct htb_class *)arg;
1167 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1168 }
1169 
1170 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1171 {
1172 	struct htb_class *cl = (struct htb_class *)arg;
1173 
1174 	if (cl->un.leaf.q->q.qlen == 0)
1175 		htb_deactivate(qdisc_priv(sch), cl);
1176 }
1177 
1178 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1179 {
1180 	struct htb_class *cl = htb_find(classid, sch);
1181 	if (cl)
1182 		cl->refcnt++;
1183 	return (unsigned long)cl;
1184 }
1185 
1186 static inline int htb_parent_last_child(struct htb_class *cl)
1187 {
1188 	if (!cl->parent)
1189 		/* the root class */
1190 		return 0;
1191 
1192 	if (!(cl->parent->children.next == &cl->sibling &&
1193 		cl->parent->children.prev == &cl->sibling))
1194 		/* not the last child */
1195 		return 0;
1196 
1197 	return 1;
1198 }
1199 
1200 static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q)
1201 {
1202 	struct htb_class *parent = cl->parent;
1203 
1204 	BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);
1205 
1206 	parent->level = 0;
1207 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1208 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1209 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1210 	parent->un.leaf.quantum = parent->quantum;
1211 	parent->un.leaf.prio = parent->prio;
1212 	parent->tokens = parent->buffer;
1213 	parent->ctokens = parent->cbuffer;
1214 	parent->t_c = psched_get_time();
1215 	parent->cmode = HTB_CAN_SEND;
1216 }
1217 
1218 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1219 {
1220 	struct htb_sched *q = qdisc_priv(sch);
1221 
1222 	if (!cl->level) {
1223 		BUG_TRAP(cl->un.leaf.q);
1224 		qdisc_destroy(cl->un.leaf.q);
1225 	}
1226 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1227 	qdisc_put_rtab(cl->rate);
1228 	qdisc_put_rtab(cl->ceil);
1229 
1230 	tcf_destroy_chain(cl->filter_list);
1231 
1232 	while (!list_empty(&cl->children))
1233 		htb_destroy_class(sch, list_entry(cl->children.next,
1234 						  struct htb_class, sibling));
1235 
1236 	/* note: this delete may happen twice (see htb_delete) */
1237 	hlist_del_init(&cl->hlist);
1238 	list_del(&cl->sibling);
1239 
1240 	if (cl->prio_activity)
1241 		htb_deactivate(q, cl);
1242 
1243 	if (cl->cmode != HTB_CAN_SEND)
1244 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1245 
1246 	kfree(cl);
1247 }
1248 
1249 /* always caled under BH & queue lock */
1250 static void htb_destroy(struct Qdisc *sch)
1251 {
1252 	struct htb_sched *q = qdisc_priv(sch);
1253 
1254 	qdisc_watchdog_cancel(&q->watchdog);
1255 	/* This line used to be after htb_destroy_class call below
1256 	   and surprisingly it worked in 2.4. But it must precede it
1257 	   because filter need its target class alive to be able to call
1258 	   unbind_filter on it (without Oops). */
1259 	tcf_destroy_chain(q->filter_list);
1260 
1261 	while (!list_empty(&q->root))
1262 		htb_destroy_class(sch, list_entry(q->root.next,
1263 						  struct htb_class, sibling));
1264 
1265 	__skb_queue_purge(&q->direct_queue);
1266 }
1267 
1268 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1269 {
1270 	struct htb_sched *q = qdisc_priv(sch);
1271 	struct htb_class *cl = (struct htb_class *)arg;
1272 	unsigned int qlen;
1273 	struct Qdisc *new_q = NULL;
1274 	int last_child = 0;
1275 
1276 	// TODO: why don't allow to delete subtree ? references ? does
1277 	// tc subsys quarantee us that in htb_destroy it holds no class
1278 	// refs so that we can remove children safely there ?
1279 	if (!list_empty(&cl->children) || cl->filter_cnt)
1280 		return -EBUSY;
1281 
1282 	if (!cl->level && htb_parent_last_child(cl)) {
1283 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1284 						cl->parent->classid);
1285 		last_child = 1;
1286 	}
1287 
1288 	sch_tree_lock(sch);
1289 
1290 	if (!cl->level) {
1291 		qlen = cl->un.leaf.q->q.qlen;
1292 		qdisc_reset(cl->un.leaf.q);
1293 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1294 	}
1295 
1296 	/* delete from hash and active; remainder in destroy_class */
1297 	hlist_del_init(&cl->hlist);
1298 
1299 	if (cl->prio_activity)
1300 		htb_deactivate(q, cl);
1301 
1302 	if (last_child)
1303 		htb_parent_to_leaf(cl, new_q);
1304 
1305 	if (--cl->refcnt == 0)
1306 		htb_destroy_class(sch, cl);
1307 
1308 	sch_tree_unlock(sch);
1309 	return 0;
1310 }
1311 
1312 static void htb_put(struct Qdisc *sch, unsigned long arg)
1313 {
1314 	struct htb_class *cl = (struct htb_class *)arg;
1315 
1316 	if (--cl->refcnt == 0)
1317 		htb_destroy_class(sch, cl);
1318 }
1319 
1320 static int htb_change_class(struct Qdisc *sch, u32 classid,
1321 			    u32 parentid, struct nlattr **tca,
1322 			    unsigned long *arg)
1323 {
1324 	int err = -EINVAL;
1325 	struct htb_sched *q = qdisc_priv(sch);
1326 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1327 	struct nlattr *opt = tca[TCA_OPTIONS];
1328 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1329 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1330 	struct tc_htb_opt *hopt;
1331 
1332 	/* extract all subattrs from opt attr */
1333 	if (!opt)
1334 		goto failure;
1335 
1336 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1337 	if (err < 0)
1338 		goto failure;
1339 
1340 	err = -EINVAL;
1341 	if (tb[TCA_HTB_PARMS] == NULL)
1342 		goto failure;
1343 
1344 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1345 
1346 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1347 
1348 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1349 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1350 	if (!rtab || !ctab)
1351 		goto failure;
1352 
1353 	if (!cl) {		/* new class */
1354 		struct Qdisc *new_q;
1355 		int prio;
1356 		struct {
1357 			struct nlattr		nla;
1358 			struct gnet_estimator	opt;
1359 		} est = {
1360 			.nla = {
1361 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1362 				.nla_type	= TCA_RATE,
1363 			},
1364 			.opt = {
1365 				/* 4s interval, 16s averaging constant */
1366 				.interval	= 2,
1367 				.ewma_log	= 2,
1368 			},
1369 		};
1370 
1371 		/* check for valid classid */
1372 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1373 		    || htb_find(classid, sch))
1374 			goto failure;
1375 
1376 		/* check maximal depth */
1377 		if (parent && parent->parent && parent->parent->level < 2) {
1378 			printk(KERN_ERR "htb: tree is too deep\n");
1379 			goto failure;
1380 		}
1381 		err = -ENOBUFS;
1382 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1383 			goto failure;
1384 
1385 		gen_new_estimator(&cl->bstats, &cl->rate_est,
1386 				  &sch->dev->queue_lock,
1387 				  tca[TCA_RATE] ? : &est.nla);
1388 		cl->refcnt = 1;
1389 		INIT_LIST_HEAD(&cl->sibling);
1390 		INIT_HLIST_NODE(&cl->hlist);
1391 		INIT_LIST_HEAD(&cl->children);
1392 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1393 		RB_CLEAR_NODE(&cl->pq_node);
1394 
1395 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1396 			RB_CLEAR_NODE(&cl->node[prio]);
1397 
1398 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1399 		   so that can't be used inside of sch_tree_lock
1400 		   -- thanks to Karlis Peisenieks */
1401 		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
1402 		sch_tree_lock(sch);
1403 		if (parent && !parent->level) {
1404 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1405 
1406 			/* turn parent into inner node */
1407 			qdisc_reset(parent->un.leaf.q);
1408 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1409 			qdisc_destroy(parent->un.leaf.q);
1410 			if (parent->prio_activity)
1411 				htb_deactivate(q, parent);
1412 
1413 			/* remove from evt list because of level change */
1414 			if (parent->cmode != HTB_CAN_SEND) {
1415 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1416 				parent->cmode = HTB_CAN_SEND;
1417 			}
1418 			parent->level = (parent->parent ? parent->parent->level
1419 					 : TC_HTB_MAXDEPTH) - 1;
1420 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1421 		}
1422 		/* leaf (we) needs elementary qdisc */
1423 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1424 
1425 		cl->classid = classid;
1426 		cl->parent = parent;
1427 
1428 		/* set class to be in HTB_CAN_SEND state */
1429 		cl->tokens = hopt->buffer;
1430 		cl->ctokens = hopt->cbuffer;
1431 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1432 		cl->t_c = psched_get_time();
1433 		cl->cmode = HTB_CAN_SEND;
1434 
1435 		/* attach to the hash list and parent's family */
1436 		hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
1437 		list_add_tail(&cl->sibling,
1438 			      parent ? &parent->children : &q->root);
1439 	} else {
1440 		if (tca[TCA_RATE])
1441 			gen_replace_estimator(&cl->bstats, &cl->rate_est,
1442 					      &sch->dev->queue_lock,
1443 					      tca[TCA_RATE]);
1444 		sch_tree_lock(sch);
1445 	}
1446 
1447 	/* it used to be a nasty bug here, we have to check that node
1448 	   is really leaf before changing cl->un.leaf ! */
1449 	if (!cl->level) {
1450 		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
1451 		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
1452 			printk(KERN_WARNING
1453 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1454 			       cl->classid);
1455 			cl->un.leaf.quantum = 1000;
1456 		}
1457 		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
1458 			printk(KERN_WARNING
1459 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1460 			       cl->classid);
1461 			cl->un.leaf.quantum = 200000;
1462 		}
1463 		if (hopt->quantum)
1464 			cl->un.leaf.quantum = hopt->quantum;
1465 		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
1466 			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;
1467 
1468 		/* backup for htb_parent_to_leaf */
1469 		cl->quantum = cl->un.leaf.quantum;
1470 		cl->prio = cl->un.leaf.prio;
1471 	}
1472 
1473 	cl->buffer = hopt->buffer;
1474 	cl->cbuffer = hopt->cbuffer;
1475 	if (cl->rate)
1476 		qdisc_put_rtab(cl->rate);
1477 	cl->rate = rtab;
1478 	if (cl->ceil)
1479 		qdisc_put_rtab(cl->ceil);
1480 	cl->ceil = ctab;
1481 	sch_tree_unlock(sch);
1482 
1483 	*arg = (unsigned long)cl;
1484 	return 0;
1485 
1486 failure:
1487 	if (rtab)
1488 		qdisc_put_rtab(rtab);
1489 	if (ctab)
1490 		qdisc_put_rtab(ctab);
1491 	return err;
1492 }
1493 
1494 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1495 {
1496 	struct htb_sched *q = qdisc_priv(sch);
1497 	struct htb_class *cl = (struct htb_class *)arg;
1498 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1499 
1500 	return fl;
1501 }
1502 
1503 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1504 				     u32 classid)
1505 {
1506 	struct htb_sched *q = qdisc_priv(sch);
1507 	struct htb_class *cl = htb_find(classid, sch);
1508 
1509 	/*if (cl && !cl->level) return 0;
1510 	   The line above used to be there to prevent attaching filters to
1511 	   leaves. But at least tc_index filter uses this just to get class
1512 	   for other reasons so that we have to allow for it.
1513 	   ----
1514 	   19.6.2002 As Werner explained it is ok - bind filter is just
1515 	   another way to "lock" the class - unlike "get" this lock can
1516 	   be broken by class during destroy IIUC.
1517 	 */
1518 	if (cl)
1519 		cl->filter_cnt++;
1520 	else
1521 		q->filter_cnt++;
1522 	return (unsigned long)cl;
1523 }
1524 
1525 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1526 {
1527 	struct htb_sched *q = qdisc_priv(sch);
1528 	struct htb_class *cl = (struct htb_class *)arg;
1529 
1530 	if (cl)
1531 		cl->filter_cnt--;
1532 	else
1533 		q->filter_cnt--;
1534 }
1535 
1536 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1537 {
1538 	struct htb_sched *q = qdisc_priv(sch);
1539 	int i;
1540 
1541 	if (arg->stop)
1542 		return;
1543 
1544 	for (i = 0; i < HTB_HSIZE; i++) {
1545 		struct hlist_node *p;
1546 		struct htb_class *cl;
1547 
1548 		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
1549 			if (arg->count < arg->skip) {
1550 				arg->count++;
1551 				continue;
1552 			}
1553 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1554 				arg->stop = 1;
1555 				return;
1556 			}
1557 			arg->count++;
1558 		}
1559 	}
1560 }
1561 
1562 static const struct Qdisc_class_ops htb_class_ops = {
1563 	.graft		=	htb_graft,
1564 	.leaf		=	htb_leaf,
1565 	.qlen_notify	=	htb_qlen_notify,
1566 	.get		=	htb_get,
1567 	.put		=	htb_put,
1568 	.change		=	htb_change_class,
1569 	.delete		=	htb_delete,
1570 	.walk		=	htb_walk,
1571 	.tcf_chain	=	htb_find_tcf,
1572 	.bind_tcf	=	htb_bind_filter,
1573 	.unbind_tcf	=	htb_unbind_filter,
1574 	.dump		=	htb_dump_class,
1575 	.dump_stats	=	htb_dump_class_stats,
1576 };
1577 
1578 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1579 	.next		=	NULL,
1580 	.cl_ops		=	&htb_class_ops,
1581 	.id		=	"htb",
1582 	.priv_size	=	sizeof(struct htb_sched),
1583 	.enqueue	=	htb_enqueue,
1584 	.dequeue	=	htb_dequeue,
1585 	.requeue	=	htb_requeue,
1586 	.drop		=	htb_drop,
1587 	.init		=	htb_init,
1588 	.reset		=	htb_reset,
1589 	.destroy	=	htb_destroy,
1590 	.change		=	NULL /* htb_change */,
1591 	.dump		=	htb_dump,
1592 	.owner		=	THIS_MODULE,
1593 };
1594 
1595 static int __init htb_module_init(void)
1596 {
1597 	return register_qdisc(&htb_qdisc_ops);
1598 }
1599 static void __exit htb_module_exit(void)
1600 {
1601 	unregister_qdisc(&htb_qdisc_ops);
1602 }
1603 
1604 module_init(htb_module_init)
1605 module_exit(htb_module_exit)
1606 MODULE_LICENSE("GPL");
1607