xref: /openbmc/linux/net/sched/sch_htb.c (revision fea966f7)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <net/netlink.h>
40 #include <net/pkt_sched.h>
41 
42 /* HTB algorithm.
43     Author: devik@cdi.cz
44     ========================================================================
45     HTB is like TBF with multiple classes. It is also similar to CBQ because
46     it allows to assign priority to each class in hierarchy.
47     In fact it is another implementation of Floyd's formal sharing.
48 
49     Levels:
50     Each class is assigned level. Leaf has ALWAYS level 0 and root
51     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
52     one less than their parent.
53 */
54 
55 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
56 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
57 
58 #if HTB_VER >> 16 != TC_HTB_PROTOVER
59 #error "Mismatched sch_htb.c and pkt_sch.h"
60 #endif
61 
62 /* Module parameter and sysfs export */
63 module_param    (htb_hysteresis, int, 0640);
64 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
65 
66 /* used internaly to keep status of single class */
67 enum htb_cmode {
68 	HTB_CANT_SEND,		/* class can't send and can't borrow */
69 	HTB_MAY_BORROW,		/* class can't send but may borrow */
70 	HTB_CAN_SEND		/* class can send */
71 };
72 
73 /* interior & leaf nodes; props specific to leaves are marked L: */
74 struct htb_class {
75 	struct Qdisc_class_common common;
76 	/* general class parameters */
77 	struct gnet_stats_basic_packed bstats;
78 	struct gnet_stats_queue qstats;
79 	struct gnet_stats_rate_est rate_est;
80 	struct tc_htb_xstats xstats;	/* our special stats */
81 	int refcnt;		/* usage count of this class */
82 
83 	/* topology */
84 	int level;		/* our level (see above) */
85 	unsigned int children;
86 	struct htb_class *parent;	/* parent class */
87 
88 	int prio;		/* these two are used only by leaves... */
89 	int quantum;		/* but stored for parent-to-leaf return */
90 
91 	union {
92 		struct htb_class_leaf {
93 			struct Qdisc *q;
94 			int deficit[TC_HTB_MAXDEPTH];
95 			struct list_head drop_list;
96 		} leaf;
97 		struct htb_class_inner {
98 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
99 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
100 			/* When class changes from state 1->2 and disconnects from
101 			   parent's feed then we lost ptr value and start from the
102 			   first child again. Here we store classid of the
103 			   last valid ptr (used when ptr is NULL). */
104 			u32 last_ptr_id[TC_HTB_NUMPRIO];
105 		} inner;
106 	} un;
107 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
108 	struct rb_node pq_node;	/* node for event queue */
109 	psched_time_t pq_key;
110 
111 	int prio_activity;	/* for which prios are we active */
112 	enum htb_cmode cmode;	/* current mode of the class */
113 
114 	/* class attached filters */
115 	struct tcf_proto *filter_list;
116 	int filter_cnt;
117 
118 	/* token bucket parameters */
119 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
120 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
121 	long buffer, cbuffer;	/* token bucket depth/rate */
122 	psched_tdiff_t mbuffer;	/* max wait time */
123 	long tokens, ctokens;	/* current number of tokens */
124 	psched_time_t t_c;	/* checkpoint time */
125 };
126 
127 struct htb_sched {
128 	struct Qdisc_class_hash clhash;
129 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
130 
131 	/* self list - roots of self generating tree */
132 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
133 	int row_mask[TC_HTB_MAXDEPTH];
134 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
135 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
136 
137 	/* self wait list - roots of wait PQs per row */
138 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
139 
140 	/* time of nearest event per level (row) */
141 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
142 
143 	int defcls;		/* class where unclassified flows go to */
144 
145 	/* filters for qdisc itself */
146 	struct tcf_proto *filter_list;
147 
148 	int rate2quantum;	/* quant = rate / rate2quantum */
149 	psched_time_t now;	/* cached dequeue time */
150 	struct qdisc_watchdog watchdog;
151 
152 	/* non shaped skbs; let them go directly thru */
153 	struct sk_buff_head direct_queue;
154 	int direct_qlen;	/* max qlen of above */
155 
156 	long direct_pkts;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int warned;	/* only one warning */
160 	struct work_struct work;
161 };
162 
163 /* find class in global hash table using given handle */
164 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
165 {
166 	struct htb_sched *q = qdisc_priv(sch);
167 	struct Qdisc_class_common *clc;
168 
169 	clc = qdisc_class_find(&q->clhash, handle);
170 	if (clc == NULL)
171 		return NULL;
172 	return container_of(clc, struct htb_class, common);
173 }
174 
175 /**
176  * htb_classify - classify a packet into class
177  *
178  * It returns NULL if the packet should be dropped or -1 if the packet
179  * should be passed directly thru. In all other cases leaf class is returned.
180  * We allow direct class selection by classid in priority. The we examine
181  * filters in qdisc and in inner nodes (if higher filter points to the inner
182  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
183  * internal fifo (direct). These packets then go directly thru. If we still
184  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
185  * then finish and return direct queue.
186  */
187 #define HTB_DIRECT (struct htb_class*)-1
188 
189 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
190 				      int *qerr)
191 {
192 	struct htb_sched *q = qdisc_priv(sch);
193 	struct htb_class *cl;
194 	struct tcf_result res;
195 	struct tcf_proto *tcf;
196 	int result;
197 
198 	/* allow to select class by setting skb->priority to valid classid;
199 	   note that nfmark can be used too by attaching filter fw with no
200 	   rules in it */
201 	if (skb->priority == sch->handle)
202 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
203 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
204 		return cl;
205 
206 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
207 	tcf = q->filter_list;
208 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
209 #ifdef CONFIG_NET_CLS_ACT
210 		switch (result) {
211 		case TC_ACT_QUEUED:
212 		case TC_ACT_STOLEN:
213 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
214 		case TC_ACT_SHOT:
215 			return NULL;
216 		}
217 #endif
218 		if ((cl = (void *)res.class) == NULL) {
219 			if (res.classid == sch->handle)
220 				return HTB_DIRECT;	/* X:0 (direct flow) */
221 			if ((cl = htb_find(res.classid, sch)) == NULL)
222 				break;	/* filter selected invalid classid */
223 		}
224 		if (!cl->level)
225 			return cl;	/* we hit leaf; return it */
226 
227 		/* we have got inner class; apply inner filter chain */
228 		tcf = cl->filter_list;
229 	}
230 	/* classification failed; try to use default class */
231 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
232 	if (!cl || cl->level)
233 		return HTB_DIRECT;	/* bad default .. this is safe bet */
234 	return cl;
235 }
236 
237 /**
238  * htb_add_to_id_tree - adds class to the round robin list
239  *
240  * Routine adds class to the list (actually tree) sorted by classid.
241  * Make sure that class is not already on such list for given prio.
242  */
243 static void htb_add_to_id_tree(struct rb_root *root,
244 			       struct htb_class *cl, int prio)
245 {
246 	struct rb_node **p = &root->rb_node, *parent = NULL;
247 
248 	while (*p) {
249 		struct htb_class *c;
250 		parent = *p;
251 		c = rb_entry(parent, struct htb_class, node[prio]);
252 
253 		if (cl->common.classid > c->common.classid)
254 			p = &parent->rb_right;
255 		else
256 			p = &parent->rb_left;
257 	}
258 	rb_link_node(&cl->node[prio], parent, p);
259 	rb_insert_color(&cl->node[prio], root);
260 }
261 
262 /**
263  * htb_add_to_wait_tree - adds class to the event queue with delay
264  *
265  * The class is added to priority event queue to indicate that class will
266  * change its mode in cl->pq_key microseconds. Make sure that class is not
267  * already in the queue.
268  */
269 static void htb_add_to_wait_tree(struct htb_sched *q,
270 				 struct htb_class *cl, long delay)
271 {
272 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
273 
274 	cl->pq_key = q->now + delay;
275 	if (cl->pq_key == q->now)
276 		cl->pq_key++;
277 
278 	/* update the nearest event cache */
279 	if (q->near_ev_cache[cl->level] > cl->pq_key)
280 		q->near_ev_cache[cl->level] = cl->pq_key;
281 
282 	while (*p) {
283 		struct htb_class *c;
284 		parent = *p;
285 		c = rb_entry(parent, struct htb_class, pq_node);
286 		if (cl->pq_key >= c->pq_key)
287 			p = &parent->rb_right;
288 		else
289 			p = &parent->rb_left;
290 	}
291 	rb_link_node(&cl->pq_node, parent, p);
292 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
293 }
294 
295 /**
296  * htb_next_rb_node - finds next node in binary tree
297  *
298  * When we are past last key we return NULL.
299  * Average complexity is 2 steps per call.
300  */
301 static inline void htb_next_rb_node(struct rb_node **n)
302 {
303 	*n = rb_next(*n);
304 }
305 
306 /**
307  * htb_add_class_to_row - add class to its row
308  *
309  * The class is added to row at priorities marked in mask.
310  * It does nothing if mask == 0.
311  */
312 static inline void htb_add_class_to_row(struct htb_sched *q,
313 					struct htb_class *cl, int mask)
314 {
315 	q->row_mask[cl->level] |= mask;
316 	while (mask) {
317 		int prio = ffz(~mask);
318 		mask &= ~(1 << prio);
319 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
320 	}
321 }
322 
323 /* If this triggers, it is a bug in this code, but it need not be fatal */
324 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
325 {
326 	if (RB_EMPTY_NODE(rb)) {
327 		WARN_ON(1);
328 	} else {
329 		rb_erase(rb, root);
330 		RB_CLEAR_NODE(rb);
331 	}
332 }
333 
334 
335 /**
336  * htb_remove_class_from_row - removes class from its row
337  *
338  * The class is removed from row at priorities marked in mask.
339  * It does nothing if mask == 0.
340  */
341 static inline void htb_remove_class_from_row(struct htb_sched *q,
342 						 struct htb_class *cl, int mask)
343 {
344 	int m = 0;
345 
346 	while (mask) {
347 		int prio = ffz(~mask);
348 
349 		mask &= ~(1 << prio);
350 		if (q->ptr[cl->level][prio] == cl->node + prio)
351 			htb_next_rb_node(q->ptr[cl->level] + prio);
352 
353 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
354 		if (!q->row[cl->level][prio].rb_node)
355 			m |= 1 << prio;
356 	}
357 	q->row_mask[cl->level] &= ~m;
358 }
359 
360 /**
361  * htb_activate_prios - creates active classe's feed chain
362  *
363  * The class is connected to ancestors and/or appropriate rows
364  * for priorities it is participating on. cl->cmode must be new
365  * (activated) mode. It does nothing if cl->prio_activity == 0.
366  */
367 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
368 {
369 	struct htb_class *p = cl->parent;
370 	long m, mask = cl->prio_activity;
371 
372 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
373 		m = mask;
374 		while (m) {
375 			int prio = ffz(~m);
376 			m &= ~(1 << prio);
377 
378 			if (p->un.inner.feed[prio].rb_node)
379 				/* parent already has its feed in use so that
380 				   reset bit in mask as parent is already ok */
381 				mask &= ~(1 << prio);
382 
383 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
384 		}
385 		p->prio_activity |= mask;
386 		cl = p;
387 		p = cl->parent;
388 
389 	}
390 	if (cl->cmode == HTB_CAN_SEND && mask)
391 		htb_add_class_to_row(q, cl, mask);
392 }
393 
394 /**
395  * htb_deactivate_prios - remove class from feed chain
396  *
397  * cl->cmode must represent old mode (before deactivation). It does
398  * nothing if cl->prio_activity == 0. Class is removed from all feed
399  * chains and rows.
400  */
401 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
402 {
403 	struct htb_class *p = cl->parent;
404 	long m, mask = cl->prio_activity;
405 
406 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
407 		m = mask;
408 		mask = 0;
409 		while (m) {
410 			int prio = ffz(~m);
411 			m &= ~(1 << prio);
412 
413 			if (p->un.inner.ptr[prio] == cl->node + prio) {
414 				/* we are removing child which is pointed to from
415 				   parent feed - forget the pointer but remember
416 				   classid */
417 				p->un.inner.last_ptr_id[prio] = cl->common.classid;
418 				p->un.inner.ptr[prio] = NULL;
419 			}
420 
421 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
422 
423 			if (!p->un.inner.feed[prio].rb_node)
424 				mask |= 1 << prio;
425 		}
426 
427 		p->prio_activity &= ~mask;
428 		cl = p;
429 		p = cl->parent;
430 
431 	}
432 	if (cl->cmode == HTB_CAN_SEND && mask)
433 		htb_remove_class_from_row(q, cl, mask);
434 }
435 
436 static inline long htb_lowater(const struct htb_class *cl)
437 {
438 	if (htb_hysteresis)
439 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
440 	else
441 		return 0;
442 }
443 static inline long htb_hiwater(const struct htb_class *cl)
444 {
445 	if (htb_hysteresis)
446 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
447 	else
448 		return 0;
449 }
450 
451 
452 /**
453  * htb_class_mode - computes and returns current class mode
454  *
455  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
456  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
457  * from now to time when cl will change its state.
458  * Also it is worth to note that class mode doesn't change simply
459  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
460  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
461  * mode transitions per time unit. The speed gain is about 1/6.
462  */
463 static inline enum htb_cmode
464 htb_class_mode(struct htb_class *cl, long *diff)
465 {
466 	long toks;
467 
468 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
469 		*diff = -toks;
470 		return HTB_CANT_SEND;
471 	}
472 
473 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
474 		return HTB_CAN_SEND;
475 
476 	*diff = -toks;
477 	return HTB_MAY_BORROW;
478 }
479 
480 /**
481  * htb_change_class_mode - changes classe's mode
482  *
483  * This should be the only way how to change classe's mode under normal
484  * cirsumstances. Routine will update feed lists linkage, change mode
485  * and add class to the wait event queue if appropriate. New mode should
486  * be different from old one and cl->pq_key has to be valid if changing
487  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
488  */
489 static void
490 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
491 {
492 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
493 
494 	if (new_mode == cl->cmode)
495 		return;
496 
497 	if (cl->prio_activity) {	/* not necessary: speed optimization */
498 		if (cl->cmode != HTB_CANT_SEND)
499 			htb_deactivate_prios(q, cl);
500 		cl->cmode = new_mode;
501 		if (new_mode != HTB_CANT_SEND)
502 			htb_activate_prios(q, cl);
503 	} else
504 		cl->cmode = new_mode;
505 }
506 
507 /**
508  * htb_activate - inserts leaf cl into appropriate active feeds
509  *
510  * Routine learns (new) priority of leaf and activates feed chain
511  * for the prio. It can be called on already active leaf safely.
512  * It also adds leaf into droplist.
513  */
514 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
515 {
516 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
517 
518 	if (!cl->prio_activity) {
519 		cl->prio_activity = 1 << cl->prio;
520 		htb_activate_prios(q, cl);
521 		list_add_tail(&cl->un.leaf.drop_list,
522 			      q->drops + cl->prio);
523 	}
524 }
525 
526 /**
527  * htb_deactivate - remove leaf cl from active feeds
528  *
529  * Make sure that leaf is active. In the other words it can't be called
530  * with non-active leaf. It also removes class from the drop list.
531  */
532 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
533 {
534 	WARN_ON(!cl->prio_activity);
535 
536 	htb_deactivate_prios(q, cl);
537 	cl->prio_activity = 0;
538 	list_del_init(&cl->un.leaf.drop_list);
539 }
540 
541 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
542 {
543 	int uninitialized_var(ret);
544 	struct htb_sched *q = qdisc_priv(sch);
545 	struct htb_class *cl = htb_classify(skb, sch, &ret);
546 
547 	if (cl == HTB_DIRECT) {
548 		/* enqueue to helper queue */
549 		if (q->direct_queue.qlen < q->direct_qlen) {
550 			__skb_queue_tail(&q->direct_queue, skb);
551 			q->direct_pkts++;
552 		} else {
553 			kfree_skb(skb);
554 			sch->qstats.drops++;
555 			return NET_XMIT_DROP;
556 		}
557 #ifdef CONFIG_NET_CLS_ACT
558 	} else if (!cl) {
559 		if (ret & __NET_XMIT_BYPASS)
560 			sch->qstats.drops++;
561 		kfree_skb(skb);
562 		return ret;
563 #endif
564 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
565 		if (net_xmit_drop_count(ret)) {
566 			sch->qstats.drops++;
567 			cl->qstats.drops++;
568 		}
569 		return ret;
570 	} else {
571 		cl->bstats.packets +=
572 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
573 		cl->bstats.bytes += qdisc_pkt_len(skb);
574 		htb_activate(q, cl);
575 	}
576 
577 	sch->q.qlen++;
578 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
579 	sch->bstats.bytes += qdisc_pkt_len(skb);
580 	return NET_XMIT_SUCCESS;
581 }
582 
583 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
584 {
585 	long toks = diff + cl->tokens;
586 
587 	if (toks > cl->buffer)
588 		toks = cl->buffer;
589 	toks -= (long) qdisc_l2t(cl->rate, bytes);
590 	if (toks <= -cl->mbuffer)
591 		toks = 1 - cl->mbuffer;
592 
593 	cl->tokens = toks;
594 }
595 
596 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
597 {
598 	long toks = diff + cl->ctokens;
599 
600 	if (toks > cl->cbuffer)
601 		toks = cl->cbuffer;
602 	toks -= (long) qdisc_l2t(cl->ceil, bytes);
603 	if (toks <= -cl->mbuffer)
604 		toks = 1 - cl->mbuffer;
605 
606 	cl->ctokens = toks;
607 }
608 
609 /**
610  * htb_charge_class - charges amount "bytes" to leaf and ancestors
611  *
612  * Routine assumes that packet "bytes" long was dequeued from leaf cl
613  * borrowing from "level". It accounts bytes to ceil leaky bucket for
614  * leaf and all ancestors and to rate bucket for ancestors at levels
615  * "level" and higher. It also handles possible change of mode resulting
616  * from the update. Note that mode can also increase here (MAY_BORROW to
617  * CAN_SEND) because we can use more precise clock that event queue here.
618  * In such case we remove class from event queue first.
619  */
620 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
621 			     int level, struct sk_buff *skb)
622 {
623 	int bytes = qdisc_pkt_len(skb);
624 	enum htb_cmode old_mode;
625 	long diff;
626 
627 	while (cl) {
628 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
629 		if (cl->level >= level) {
630 			if (cl->level == level)
631 				cl->xstats.lends++;
632 			htb_accnt_tokens(cl, bytes, diff);
633 		} else {
634 			cl->xstats.borrows++;
635 			cl->tokens += diff;	/* we moved t_c; update tokens */
636 		}
637 		htb_accnt_ctokens(cl, bytes, diff);
638 		cl->t_c = q->now;
639 
640 		old_mode = cl->cmode;
641 		diff = 0;
642 		htb_change_class_mode(q, cl, &diff);
643 		if (old_mode != cl->cmode) {
644 			if (old_mode != HTB_CAN_SEND)
645 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
646 			if (cl->cmode != HTB_CAN_SEND)
647 				htb_add_to_wait_tree(q, cl, diff);
648 		}
649 
650 		/* update byte stats except for leaves which are already updated */
651 		if (cl->level) {
652 			cl->bstats.bytes += bytes;
653 			cl->bstats.packets += skb_is_gso(skb)?
654 					skb_shinfo(skb)->gso_segs:1;
655 		}
656 		cl = cl->parent;
657 	}
658 }
659 
660 /**
661  * htb_do_events - make mode changes to classes at the level
662  *
663  * Scans event queue for pending events and applies them. Returns time of
664  * next pending event (0 for no event in pq, q->now for too many events).
665  * Note: Applied are events whose have cl->pq_key <= q->now.
666  */
667 static psched_time_t htb_do_events(struct htb_sched *q, int level,
668 				   unsigned long start)
669 {
670 	/* don't run for longer than 2 jiffies; 2 is used instead of
671 	   1 to simplify things when jiffy is going to be incremented
672 	   too soon */
673 	unsigned long stop_at = start + 2;
674 	while (time_before(jiffies, stop_at)) {
675 		struct htb_class *cl;
676 		long diff;
677 		struct rb_node *p = rb_first(&q->wait_pq[level]);
678 
679 		if (!p)
680 			return 0;
681 
682 		cl = rb_entry(p, struct htb_class, pq_node);
683 		if (cl->pq_key > q->now)
684 			return cl->pq_key;
685 
686 		htb_safe_rb_erase(p, q->wait_pq + level);
687 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
688 		htb_change_class_mode(q, cl, &diff);
689 		if (cl->cmode != HTB_CAN_SEND)
690 			htb_add_to_wait_tree(q, cl, diff);
691 	}
692 
693 	/* too much load - let's continue after a break for scheduling */
694 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
695 		printk(KERN_WARNING "htb: too many events!\n");
696 		q->warned |= HTB_WARN_TOOMANYEVENTS;
697 	}
698 
699 	return q->now;
700 }
701 
702 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
703    is no such one exists. */
704 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
705 					      u32 id)
706 {
707 	struct rb_node *r = NULL;
708 	while (n) {
709 		struct htb_class *cl =
710 		    rb_entry(n, struct htb_class, node[prio]);
711 
712 		if (id > cl->common.classid) {
713 			n = n->rb_right;
714 		} else if (id < cl->common.classid) {
715 			r = n;
716 			n = n->rb_left;
717 		} else {
718 			return n;
719 		}
720 	}
721 	return r;
722 }
723 
724 /**
725  * htb_lookup_leaf - returns next leaf class in DRR order
726  *
727  * Find leaf where current feed pointers points to.
728  */
729 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
730 					 struct rb_node **pptr, u32 * pid)
731 {
732 	int i;
733 	struct {
734 		struct rb_node *root;
735 		struct rb_node **pptr;
736 		u32 *pid;
737 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
738 
739 	BUG_ON(!tree->rb_node);
740 	sp->root = tree->rb_node;
741 	sp->pptr = pptr;
742 	sp->pid = pid;
743 
744 	for (i = 0; i < 65535; i++) {
745 		if (!*sp->pptr && *sp->pid) {
746 			/* ptr was invalidated but id is valid - try to recover
747 			   the original or next ptr */
748 			*sp->pptr =
749 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
750 		}
751 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
752 				   can become out of date quickly */
753 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
754 			*sp->pptr = sp->root;
755 			while ((*sp->pptr)->rb_left)
756 				*sp->pptr = (*sp->pptr)->rb_left;
757 			if (sp > stk) {
758 				sp--;
759 				if (!*sp->pptr) {
760 					WARN_ON(1);
761 					return NULL;
762 				}
763 				htb_next_rb_node(sp->pptr);
764 			}
765 		} else {
766 			struct htb_class *cl;
767 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
768 			if (!cl->level)
769 				return cl;
770 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
771 			sp->pptr = cl->un.inner.ptr + prio;
772 			sp->pid = cl->un.inner.last_ptr_id + prio;
773 		}
774 	}
775 	WARN_ON(1);
776 	return NULL;
777 }
778 
779 /* dequeues packet at given priority and level; call only if
780    you are sure that there is active class at prio/level */
781 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
782 					int level)
783 {
784 	struct sk_buff *skb = NULL;
785 	struct htb_class *cl, *start;
786 	/* look initial class up in the row */
787 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
788 				     q->ptr[level] + prio,
789 				     q->last_ptr_id[level] + prio);
790 
791 	do {
792 next:
793 		if (unlikely(!cl))
794 			return NULL;
795 
796 		/* class can be empty - it is unlikely but can be true if leaf
797 		   qdisc drops packets in enqueue routine or if someone used
798 		   graft operation on the leaf since last dequeue;
799 		   simply deactivate and skip such class */
800 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
801 			struct htb_class *next;
802 			htb_deactivate(q, cl);
803 
804 			/* row/level might become empty */
805 			if ((q->row_mask[level] & (1 << prio)) == 0)
806 				return NULL;
807 
808 			next = htb_lookup_leaf(q->row[level] + prio,
809 					       prio, q->ptr[level] + prio,
810 					       q->last_ptr_id[level] + prio);
811 
812 			if (cl == start)	/* fix start if we just deleted it */
813 				start = next;
814 			cl = next;
815 			goto next;
816 		}
817 
818 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
819 		if (likely(skb != NULL))
820 			break;
821 
822 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
823 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
824 				  ptr[0]) + prio);
825 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
826 				     q->ptr[level] + prio,
827 				     q->last_ptr_id[level] + prio);
828 
829 	} while (cl != start);
830 
831 	if (likely(skb != NULL)) {
832 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
833 		if (cl->un.leaf.deficit[level] < 0) {
834 			cl->un.leaf.deficit[level] += cl->quantum;
835 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
836 					  ptr[0]) + prio);
837 		}
838 		/* this used to be after charge_class but this constelation
839 		   gives us slightly better performance */
840 		if (!cl->un.leaf.q->q.qlen)
841 			htb_deactivate(q, cl);
842 		htb_charge_class(q, cl, level, skb);
843 	}
844 	return skb;
845 }
846 
847 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
848 {
849 	struct sk_buff *skb = NULL;
850 	struct htb_sched *q = qdisc_priv(sch);
851 	int level;
852 	psched_time_t next_event;
853 	unsigned long start_at;
854 
855 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
856 	skb = __skb_dequeue(&q->direct_queue);
857 	if (skb != NULL) {
858 		sch->flags &= ~TCQ_F_THROTTLED;
859 		sch->q.qlen--;
860 		return skb;
861 	}
862 
863 	if (!sch->q.qlen)
864 		goto fin;
865 	q->now = psched_get_time();
866 	start_at = jiffies;
867 
868 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
869 
870 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
871 		/* common case optimization - skip event handler quickly */
872 		int m;
873 		psched_time_t event;
874 
875 		if (q->now >= q->near_ev_cache[level]) {
876 			event = htb_do_events(q, level, start_at);
877 			if (!event)
878 				event = q->now + PSCHED_TICKS_PER_SEC;
879 			q->near_ev_cache[level] = event;
880 		} else
881 			event = q->near_ev_cache[level];
882 
883 		if (next_event > event)
884 			next_event = event;
885 
886 		m = ~q->row_mask[level];
887 		while (m != (int)(-1)) {
888 			int prio = ffz(m);
889 			m |= 1 << prio;
890 			skb = htb_dequeue_tree(q, prio, level);
891 			if (likely(skb != NULL)) {
892 				sch->q.qlen--;
893 				sch->flags &= ~TCQ_F_THROTTLED;
894 				goto fin;
895 			}
896 		}
897 	}
898 	sch->qstats.overlimits++;
899 	if (likely(next_event > q->now))
900 		qdisc_watchdog_schedule(&q->watchdog, next_event);
901 	else
902 		schedule_work(&q->work);
903 fin:
904 	return skb;
905 }
906 
907 /* try to drop from each class (by prio) until one succeed */
908 static unsigned int htb_drop(struct Qdisc *sch)
909 {
910 	struct htb_sched *q = qdisc_priv(sch);
911 	int prio;
912 
913 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
914 		struct list_head *p;
915 		list_for_each(p, q->drops + prio) {
916 			struct htb_class *cl = list_entry(p, struct htb_class,
917 							  un.leaf.drop_list);
918 			unsigned int len;
919 			if (cl->un.leaf.q->ops->drop &&
920 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
921 				sch->q.qlen--;
922 				if (!cl->un.leaf.q->q.qlen)
923 					htb_deactivate(q, cl);
924 				return len;
925 			}
926 		}
927 	}
928 	return 0;
929 }
930 
931 /* reset all classes */
932 /* always caled under BH & queue lock */
933 static void htb_reset(struct Qdisc *sch)
934 {
935 	struct htb_sched *q = qdisc_priv(sch);
936 	struct htb_class *cl;
937 	struct hlist_node *n;
938 	unsigned int i;
939 
940 	for (i = 0; i < q->clhash.hashsize; i++) {
941 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
942 			if (cl->level)
943 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
944 			else {
945 				if (cl->un.leaf.q)
946 					qdisc_reset(cl->un.leaf.q);
947 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
948 			}
949 			cl->prio_activity = 0;
950 			cl->cmode = HTB_CAN_SEND;
951 
952 		}
953 	}
954 	qdisc_watchdog_cancel(&q->watchdog);
955 	__skb_queue_purge(&q->direct_queue);
956 	sch->q.qlen = 0;
957 	memset(q->row, 0, sizeof(q->row));
958 	memset(q->row_mask, 0, sizeof(q->row_mask));
959 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
960 	memset(q->ptr, 0, sizeof(q->ptr));
961 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
962 		INIT_LIST_HEAD(q->drops + i);
963 }
964 
965 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
966 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
967 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
968 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
969 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
970 };
971 
972 static void htb_work_func(struct work_struct *work)
973 {
974 	struct htb_sched *q = container_of(work, struct htb_sched, work);
975 	struct Qdisc *sch = q->watchdog.qdisc;
976 
977 	__netif_schedule(qdisc_root(sch));
978 }
979 
980 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
981 {
982 	struct htb_sched *q = qdisc_priv(sch);
983 	struct nlattr *tb[TCA_HTB_INIT + 1];
984 	struct tc_htb_glob *gopt;
985 	int err;
986 	int i;
987 
988 	if (!opt)
989 		return -EINVAL;
990 
991 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
992 	if (err < 0)
993 		return err;
994 
995 	if (tb[TCA_HTB_INIT] == NULL) {
996 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
997 		return -EINVAL;
998 	}
999 	gopt = nla_data(tb[TCA_HTB_INIT]);
1000 	if (gopt->version != HTB_VER >> 16) {
1001 		printk(KERN_ERR
1002 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1003 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1004 		return -EINVAL;
1005 	}
1006 
1007 	err = qdisc_class_hash_init(&q->clhash);
1008 	if (err < 0)
1009 		return err;
1010 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1011 		INIT_LIST_HEAD(q->drops + i);
1012 
1013 	qdisc_watchdog_init(&q->watchdog, sch);
1014 	INIT_WORK(&q->work, htb_work_func);
1015 	skb_queue_head_init(&q->direct_queue);
1016 
1017 	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1018 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1019 		q->direct_qlen = 2;
1020 
1021 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1022 		q->rate2quantum = 1;
1023 	q->defcls = gopt->defcls;
1024 
1025 	return 0;
1026 }
1027 
1028 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1029 {
1030 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1031 	struct htb_sched *q = qdisc_priv(sch);
1032 	struct nlattr *nest;
1033 	struct tc_htb_glob gopt;
1034 
1035 	spin_lock_bh(root_lock);
1036 
1037 	gopt.direct_pkts = q->direct_pkts;
1038 	gopt.version = HTB_VER;
1039 	gopt.rate2quantum = q->rate2quantum;
1040 	gopt.defcls = q->defcls;
1041 	gopt.debug = 0;
1042 
1043 	nest = nla_nest_start(skb, TCA_OPTIONS);
1044 	if (nest == NULL)
1045 		goto nla_put_failure;
1046 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1047 	nla_nest_end(skb, nest);
1048 
1049 	spin_unlock_bh(root_lock);
1050 	return skb->len;
1051 
1052 nla_put_failure:
1053 	spin_unlock_bh(root_lock);
1054 	nla_nest_cancel(skb, nest);
1055 	return -1;
1056 }
1057 
1058 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1059 			  struct sk_buff *skb, struct tcmsg *tcm)
1060 {
1061 	struct htb_class *cl = (struct htb_class *)arg;
1062 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1063 	struct nlattr *nest;
1064 	struct tc_htb_opt opt;
1065 
1066 	spin_lock_bh(root_lock);
1067 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1068 	tcm->tcm_handle = cl->common.classid;
1069 	if (!cl->level && cl->un.leaf.q)
1070 		tcm->tcm_info = cl->un.leaf.q->handle;
1071 
1072 	nest = nla_nest_start(skb, TCA_OPTIONS);
1073 	if (nest == NULL)
1074 		goto nla_put_failure;
1075 
1076 	memset(&opt, 0, sizeof(opt));
1077 
1078 	opt.rate = cl->rate->rate;
1079 	opt.buffer = cl->buffer;
1080 	opt.ceil = cl->ceil->rate;
1081 	opt.cbuffer = cl->cbuffer;
1082 	opt.quantum = cl->quantum;
1083 	opt.prio = cl->prio;
1084 	opt.level = cl->level;
1085 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1086 
1087 	nla_nest_end(skb, nest);
1088 	spin_unlock_bh(root_lock);
1089 	return skb->len;
1090 
1091 nla_put_failure:
1092 	spin_unlock_bh(root_lock);
1093 	nla_nest_cancel(skb, nest);
1094 	return -1;
1095 }
1096 
1097 static int
1098 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1099 {
1100 	struct htb_class *cl = (struct htb_class *)arg;
1101 
1102 	if (!cl->level && cl->un.leaf.q)
1103 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1104 	cl->xstats.tokens = cl->tokens;
1105 	cl->xstats.ctokens = cl->ctokens;
1106 
1107 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1108 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1109 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1110 		return -1;
1111 
1112 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1113 }
1114 
1115 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1116 		     struct Qdisc **old)
1117 {
1118 	struct htb_class *cl = (struct htb_class *)arg;
1119 
1120 	if (cl && !cl->level) {
1121 		if (new == NULL &&
1122 		    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1123 					     &pfifo_qdisc_ops,
1124 					     cl->common.classid))
1125 		    == NULL)
1126 			return -ENOBUFS;
1127 		sch_tree_lock(sch);
1128 		*old = cl->un.leaf.q;
1129 		cl->un.leaf.q = new;
1130 		if (*old != NULL) {
1131 			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1132 			qdisc_reset(*old);
1133 		}
1134 		sch_tree_unlock(sch);
1135 		return 0;
1136 	}
1137 	return -ENOENT;
1138 }
1139 
1140 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1141 {
1142 	struct htb_class *cl = (struct htb_class *)arg;
1143 	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
1144 }
1145 
1146 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1147 {
1148 	struct htb_class *cl = (struct htb_class *)arg;
1149 
1150 	if (cl->un.leaf.q->q.qlen == 0)
1151 		htb_deactivate(qdisc_priv(sch), cl);
1152 }
1153 
1154 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1155 {
1156 	struct htb_class *cl = htb_find(classid, sch);
1157 	if (cl)
1158 		cl->refcnt++;
1159 	return (unsigned long)cl;
1160 }
1161 
1162 static inline int htb_parent_last_child(struct htb_class *cl)
1163 {
1164 	if (!cl->parent)
1165 		/* the root class */
1166 		return 0;
1167 	if (cl->parent->children > 1)
1168 		/* not the last child */
1169 		return 0;
1170 	return 1;
1171 }
1172 
1173 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1174 			       struct Qdisc *new_q)
1175 {
1176 	struct htb_class *parent = cl->parent;
1177 
1178 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1179 
1180 	if (parent->cmode != HTB_CAN_SEND)
1181 		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1182 
1183 	parent->level = 0;
1184 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1185 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1186 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1187 	parent->tokens = parent->buffer;
1188 	parent->ctokens = parent->cbuffer;
1189 	parent->t_c = psched_get_time();
1190 	parent->cmode = HTB_CAN_SEND;
1191 }
1192 
1193 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1194 {
1195 	if (!cl->level) {
1196 		WARN_ON(!cl->un.leaf.q);
1197 		qdisc_destroy(cl->un.leaf.q);
1198 	}
1199 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1200 	qdisc_put_rtab(cl->rate);
1201 	qdisc_put_rtab(cl->ceil);
1202 
1203 	tcf_destroy_chain(&cl->filter_list);
1204 	kfree(cl);
1205 }
1206 
1207 static void htb_destroy(struct Qdisc *sch)
1208 {
1209 	struct htb_sched *q = qdisc_priv(sch);
1210 	struct hlist_node *n, *next;
1211 	struct htb_class *cl;
1212 	unsigned int i;
1213 
1214 	cancel_work_sync(&q->work);
1215 	qdisc_watchdog_cancel(&q->watchdog);
1216 	/* This line used to be after htb_destroy_class call below
1217 	   and surprisingly it worked in 2.4. But it must precede it
1218 	   because filter need its target class alive to be able to call
1219 	   unbind_filter on it (without Oops). */
1220 	tcf_destroy_chain(&q->filter_list);
1221 
1222 	for (i = 0; i < q->clhash.hashsize; i++) {
1223 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1224 			tcf_destroy_chain(&cl->filter_list);
1225 	}
1226 	for (i = 0; i < q->clhash.hashsize; i++) {
1227 		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1228 					  common.hnode)
1229 			htb_destroy_class(sch, cl);
1230 	}
1231 	qdisc_class_hash_destroy(&q->clhash);
1232 	__skb_queue_purge(&q->direct_queue);
1233 }
1234 
1235 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1236 {
1237 	struct htb_sched *q = qdisc_priv(sch);
1238 	struct htb_class *cl = (struct htb_class *)arg;
1239 	unsigned int qlen;
1240 	struct Qdisc *new_q = NULL;
1241 	int last_child = 0;
1242 
1243 	// TODO: why don't allow to delete subtree ? references ? does
1244 	// tc subsys quarantee us that in htb_destroy it holds no class
1245 	// refs so that we can remove children safely there ?
1246 	if (cl->children || cl->filter_cnt)
1247 		return -EBUSY;
1248 
1249 	if (!cl->level && htb_parent_last_child(cl)) {
1250 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1251 					  &pfifo_qdisc_ops,
1252 					  cl->parent->common.classid);
1253 		last_child = 1;
1254 	}
1255 
1256 	sch_tree_lock(sch);
1257 
1258 	if (!cl->level) {
1259 		qlen = cl->un.leaf.q->q.qlen;
1260 		qdisc_reset(cl->un.leaf.q);
1261 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1262 	}
1263 
1264 	/* delete from hash and active; remainder in destroy_class */
1265 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1266 	if (cl->parent)
1267 		cl->parent->children--;
1268 
1269 	if (cl->prio_activity)
1270 		htb_deactivate(q, cl);
1271 
1272 	if (cl->cmode != HTB_CAN_SEND)
1273 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1274 
1275 	if (last_child)
1276 		htb_parent_to_leaf(q, cl, new_q);
1277 
1278 	BUG_ON(--cl->refcnt == 0);
1279 	/*
1280 	 * This shouldn't happen: we "hold" one cops->get() when called
1281 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1282 	 */
1283 
1284 	sch_tree_unlock(sch);
1285 	return 0;
1286 }
1287 
1288 static void htb_put(struct Qdisc *sch, unsigned long arg)
1289 {
1290 	struct htb_class *cl = (struct htb_class *)arg;
1291 
1292 	if (--cl->refcnt == 0)
1293 		htb_destroy_class(sch, cl);
1294 }
1295 
1296 static int htb_change_class(struct Qdisc *sch, u32 classid,
1297 			    u32 parentid, struct nlattr **tca,
1298 			    unsigned long *arg)
1299 {
1300 	int err = -EINVAL;
1301 	struct htb_sched *q = qdisc_priv(sch);
1302 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1303 	struct nlattr *opt = tca[TCA_OPTIONS];
1304 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1305 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1306 	struct tc_htb_opt *hopt;
1307 
1308 	/* extract all subattrs from opt attr */
1309 	if (!opt)
1310 		goto failure;
1311 
1312 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1313 	if (err < 0)
1314 		goto failure;
1315 
1316 	err = -EINVAL;
1317 	if (tb[TCA_HTB_PARMS] == NULL)
1318 		goto failure;
1319 
1320 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1321 
1322 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1323 
1324 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1325 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1326 	if (!rtab || !ctab)
1327 		goto failure;
1328 
1329 	if (!cl) {		/* new class */
1330 		struct Qdisc *new_q;
1331 		int prio;
1332 		struct {
1333 			struct nlattr		nla;
1334 			struct gnet_estimator	opt;
1335 		} est = {
1336 			.nla = {
1337 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1338 				.nla_type	= TCA_RATE,
1339 			},
1340 			.opt = {
1341 				/* 4s interval, 16s averaging constant */
1342 				.interval	= 2,
1343 				.ewma_log	= 2,
1344 			},
1345 		};
1346 
1347 		/* check for valid classid */
1348 		if (!classid || TC_H_MAJ(classid ^ sch->handle)
1349 		    || htb_find(classid, sch))
1350 			goto failure;
1351 
1352 		/* check maximal depth */
1353 		if (parent && parent->parent && parent->parent->level < 2) {
1354 			printk(KERN_ERR "htb: tree is too deep\n");
1355 			goto failure;
1356 		}
1357 		err = -ENOBUFS;
1358 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1359 			goto failure;
1360 
1361 		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1362 					qdisc_root_sleeping_lock(sch),
1363 					tca[TCA_RATE] ? : &est.nla);
1364 		if (err) {
1365 			kfree(cl);
1366 			goto failure;
1367 		}
1368 
1369 		cl->refcnt = 1;
1370 		cl->children = 0;
1371 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1372 		RB_CLEAR_NODE(&cl->pq_node);
1373 
1374 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1375 			RB_CLEAR_NODE(&cl->node[prio]);
1376 
1377 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1378 		   so that can't be used inside of sch_tree_lock
1379 		   -- thanks to Karlis Peisenieks */
1380 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1381 					  &pfifo_qdisc_ops, classid);
1382 		sch_tree_lock(sch);
1383 		if (parent && !parent->level) {
1384 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1385 
1386 			/* turn parent into inner node */
1387 			qdisc_reset(parent->un.leaf.q);
1388 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1389 			qdisc_destroy(parent->un.leaf.q);
1390 			if (parent->prio_activity)
1391 				htb_deactivate(q, parent);
1392 
1393 			/* remove from evt list because of level change */
1394 			if (parent->cmode != HTB_CAN_SEND) {
1395 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1396 				parent->cmode = HTB_CAN_SEND;
1397 			}
1398 			parent->level = (parent->parent ? parent->parent->level
1399 					 : TC_HTB_MAXDEPTH) - 1;
1400 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1401 		}
1402 		/* leaf (we) needs elementary qdisc */
1403 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1404 
1405 		cl->common.classid = classid;
1406 		cl->parent = parent;
1407 
1408 		/* set class to be in HTB_CAN_SEND state */
1409 		cl->tokens = hopt->buffer;
1410 		cl->ctokens = hopt->cbuffer;
1411 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1412 		cl->t_c = psched_get_time();
1413 		cl->cmode = HTB_CAN_SEND;
1414 
1415 		/* attach to the hash list and parent's family */
1416 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1417 		if (parent)
1418 			parent->children++;
1419 	} else {
1420 		if (tca[TCA_RATE]) {
1421 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1422 						    qdisc_root_sleeping_lock(sch),
1423 						    tca[TCA_RATE]);
1424 			if (err)
1425 				return err;
1426 		}
1427 		sch_tree_lock(sch);
1428 	}
1429 
1430 	/* it used to be a nasty bug here, we have to check that node
1431 	   is really leaf before changing cl->un.leaf ! */
1432 	if (!cl->level) {
1433 		cl->quantum = rtab->rate.rate / q->rate2quantum;
1434 		if (!hopt->quantum && cl->quantum < 1000) {
1435 			printk(KERN_WARNING
1436 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1437 			       cl->common.classid);
1438 			cl->quantum = 1000;
1439 		}
1440 		if (!hopt->quantum && cl->quantum > 200000) {
1441 			printk(KERN_WARNING
1442 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1443 			       cl->common.classid);
1444 			cl->quantum = 200000;
1445 		}
1446 		if (hopt->quantum)
1447 			cl->quantum = hopt->quantum;
1448 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1449 			cl->prio = TC_HTB_NUMPRIO - 1;
1450 	}
1451 
1452 	cl->buffer = hopt->buffer;
1453 	cl->cbuffer = hopt->cbuffer;
1454 	if (cl->rate)
1455 		qdisc_put_rtab(cl->rate);
1456 	cl->rate = rtab;
1457 	if (cl->ceil)
1458 		qdisc_put_rtab(cl->ceil);
1459 	cl->ceil = ctab;
1460 	sch_tree_unlock(sch);
1461 
1462 	qdisc_class_hash_grow(sch, &q->clhash);
1463 
1464 	*arg = (unsigned long)cl;
1465 	return 0;
1466 
1467 failure:
1468 	if (rtab)
1469 		qdisc_put_rtab(rtab);
1470 	if (ctab)
1471 		qdisc_put_rtab(ctab);
1472 	return err;
1473 }
1474 
1475 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1476 {
1477 	struct htb_sched *q = qdisc_priv(sch);
1478 	struct htb_class *cl = (struct htb_class *)arg;
1479 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1480 
1481 	return fl;
1482 }
1483 
1484 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1485 				     u32 classid)
1486 {
1487 	struct htb_class *cl = htb_find(classid, sch);
1488 
1489 	/*if (cl && !cl->level) return 0;
1490 	   The line above used to be there to prevent attaching filters to
1491 	   leaves. But at least tc_index filter uses this just to get class
1492 	   for other reasons so that we have to allow for it.
1493 	   ----
1494 	   19.6.2002 As Werner explained it is ok - bind filter is just
1495 	   another way to "lock" the class - unlike "get" this lock can
1496 	   be broken by class during destroy IIUC.
1497 	 */
1498 	if (cl)
1499 		cl->filter_cnt++;
1500 	return (unsigned long)cl;
1501 }
1502 
1503 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1504 {
1505 	struct htb_class *cl = (struct htb_class *)arg;
1506 
1507 	if (cl)
1508 		cl->filter_cnt--;
1509 }
1510 
1511 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1512 {
1513 	struct htb_sched *q = qdisc_priv(sch);
1514 	struct htb_class *cl;
1515 	struct hlist_node *n;
1516 	unsigned int i;
1517 
1518 	if (arg->stop)
1519 		return;
1520 
1521 	for (i = 0; i < q->clhash.hashsize; i++) {
1522 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1523 			if (arg->count < arg->skip) {
1524 				arg->count++;
1525 				continue;
1526 			}
1527 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1528 				arg->stop = 1;
1529 				return;
1530 			}
1531 			arg->count++;
1532 		}
1533 	}
1534 }
1535 
1536 static const struct Qdisc_class_ops htb_class_ops = {
1537 	.graft		=	htb_graft,
1538 	.leaf		=	htb_leaf,
1539 	.qlen_notify	=	htb_qlen_notify,
1540 	.get		=	htb_get,
1541 	.put		=	htb_put,
1542 	.change		=	htb_change_class,
1543 	.delete		=	htb_delete,
1544 	.walk		=	htb_walk,
1545 	.tcf_chain	=	htb_find_tcf,
1546 	.bind_tcf	=	htb_bind_filter,
1547 	.unbind_tcf	=	htb_unbind_filter,
1548 	.dump		=	htb_dump_class,
1549 	.dump_stats	=	htb_dump_class_stats,
1550 };
1551 
1552 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1553 	.next		=	NULL,
1554 	.cl_ops		=	&htb_class_ops,
1555 	.id		=	"htb",
1556 	.priv_size	=	sizeof(struct htb_sched),
1557 	.enqueue	=	htb_enqueue,
1558 	.dequeue	=	htb_dequeue,
1559 	.peek		=	qdisc_peek_dequeued,
1560 	.drop		=	htb_drop,
1561 	.init		=	htb_init,
1562 	.reset		=	htb_reset,
1563 	.destroy	=	htb_destroy,
1564 	.change		=	NULL /* htb_change */,
1565 	.dump		=	htb_dump,
1566 	.owner		=	THIS_MODULE,
1567 };
1568 
1569 static int __init htb_module_init(void)
1570 {
1571 	return register_qdisc(&htb_qdisc_ops);
1572 }
1573 static void __exit htb_module_exit(void)
1574 {
1575 	unregister_qdisc(&htb_qdisc_ops);
1576 }
1577 
1578 module_init(htb_module_init)
1579 module_exit(htb_module_exit)
1580 MODULE_LICENSE("GPL");
1581