xref: /openbmc/linux/net/sched/sch_htb.c (revision a09d2831)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <net/netlink.h>
40 #include <net/pkt_sched.h>
41 
42 /* HTB algorithm.
43     Author: devik@cdi.cz
44     ========================================================================
45     HTB is like TBF with multiple classes. It is also similar to CBQ because
46     it allows a priority to be assigned to each class in the hierarchy.
47     In fact it is another implementation of Floyd's formal sharing.
48 
49     Levels:
50     Each class is assigned a level. A leaf ALWAYS has level 0 and root
51     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
52     one less than their parent.
53 */
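
/* Illustrative sketch (not part of the original source, classids assumed):
 * with a chain root 1:1 -> inner 1:10 -> leaf 1:100, the leaf 1:100 sits at
 * level 0, the root 1:1 at TC_HTB_MAXDEPTH-1 and the inner class 1:10 one
 * level below the root, as assigned in htb_change_class() when a former
 * leaf gains its first child. */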
54 
55 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
56 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
57 
58 #if HTB_VER >> 16 != TC_HTB_PROTOVER
59 #error "Mismatched sch_htb.c and pkt_sch.h"
60 #endif
61 
62 /* Module parameter and sysfs export */
63 module_param    (htb_hysteresis, int, 0640);
64 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
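
/* Usage sketch (assumed paths, not part of the original source): because the
 * parameter is declared with mode 0640 it can be set at load time, e.g.
 * "modprobe sch_htb htb_hysteresis=1", or toggled at runtime through
 * /sys/module/sch_htb/parameters/htb_hysteresis by a sufficiently privileged
 * user. */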
65 
66 /* used internally to keep the status of a single class */
67 enum htb_cmode {
68 	HTB_CANT_SEND,		/* class can't send and can't borrow */
69 	HTB_MAY_BORROW,		/* class can't send but may borrow */
70 	HTB_CAN_SEND		/* class can send */
71 };
72 
73 /* interior & leaf nodes; props specific to leaves are marked L: */
74 struct htb_class {
75 	struct Qdisc_class_common common;
76 	/* general class parameters */
77 	struct gnet_stats_basic_packed bstats;
78 	struct gnet_stats_queue qstats;
79 	struct gnet_stats_rate_est rate_est;
80 	struct tc_htb_xstats xstats;	/* our special stats */
81 	int refcnt;		/* usage count of this class */
82 
83 	/* topology */
84 	int level;		/* our level (see above) */
85 	unsigned int children;
86 	struct htb_class *parent;	/* parent class */
87 
88 	int prio;		/* these two are used only by leaves... */
89 	int quantum;		/* but stored for parent-to-leaf return */
90 
91 	union {
92 		struct htb_class_leaf {
93 			struct Qdisc *q;
94 			int deficit[TC_HTB_MAXDEPTH];
95 			struct list_head drop_list;
96 		} leaf;
97 		struct htb_class_inner {
98 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
99 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
100 			/* When a class changes from state 1->2 and disconnects from
101 			   its parent's feed, we lose the ptr value and start from the
102 			   first child again. Here we store the classid of the
103 			   last valid ptr (used when ptr is NULL). */
104 			u32 last_ptr_id[TC_HTB_NUMPRIO];
105 		} inner;
106 	} un;
107 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
108 	struct rb_node pq_node;	/* node for event queue */
109 	psched_time_t pq_key;
110 
111 	int prio_activity;	/* for which prios are we active */
112 	enum htb_cmode cmode;	/* current mode of the class */
113 
114 	/* class attached filters */
115 	struct tcf_proto *filter_list;
116 	int filter_cnt;
117 
118 	/* token bucket parameters */
119 	struct qdisc_rate_table *rate;	/* rate table of the class itself */
120 	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
121 	long buffer, cbuffer;	/* token bucket depth/rate */
122 	psched_tdiff_t mbuffer;	/* max wait time */
123 	long tokens, ctokens;	/* current number of tokens */
124 	psched_time_t t_c;	/* checkpoint time */
125 };
126 
127 struct htb_sched {
128 	struct Qdisc_class_hash clhash;
129 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
130 
131 	/* self list - roots of self generating tree */
132 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
133 	int row_mask[TC_HTB_MAXDEPTH];
134 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
135 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
136 
137 	/* self wait list - roots of wait PQs per row */
138 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
139 
140 	/* time of nearest event per level (row) */
141 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
142 
143 	int defcls;		/* class where unclassified flows go to */
144 
145 	/* filters for qdisc itself */
146 	struct tcf_proto *filter_list;
147 
148 	int rate2quantum;	/* quant = rate / rate2quantum */
149 	psched_time_t now;	/* cached dequeue time */
150 	struct qdisc_watchdog watchdog;
151 
152 	/* non-shaped skbs; let them go through directly */
153 	struct sk_buff_head direct_queue;
154 	int direct_qlen;	/* max qlen of above */
155 
156 	long direct_pkts;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int warned;	/* only one warning */
160 	struct work_struct work;
161 };
162 
163 /* find class in global hash table using given handle */
164 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
165 {
166 	struct htb_sched *q = qdisc_priv(sch);
167 	struct Qdisc_class_common *clc;
168 
169 	clc = qdisc_class_find(&q->clhash, handle);
170 	if (clc == NULL)
171 		return NULL;
172 	return container_of(clc, struct htb_class, common);
173 }
174 
175 /**
176  * htb_classify - classify a packet into a class
177  *
178  * It returns NULL if the packet should be dropped or -1 if the packet
179  * should be passed directly through. In all other cases a leaf class is returned.
180  * We allow direct class selection by classid in skb->priority. Then we examine
181  * filters in the qdisc and in inner nodes (if a higher filter points to an inner
182  * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
183  * internal fifo (direct). These packets then go directly through. If we still
184  * have no valid leaf we try to use the MAJOR:default leaf. If that is still
185  * unsuccessful, finish and return the direct queue.
186  */
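
/* Illustrative configuration (assumed interface and classids, not part of the
 * original source): with an HTB qdisc "1:" installed on eth0, an application
 * may pick its leaf directly by setting SO_PRIORITY to 0x10010 (classid 1:10),
 * while something like
 *
 *	tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \
 *		match ip dport 80 0xffff flowid 1:20
 *
 * steers HTTP traffic to leaf 1:20; anything matching neither ends up in the
 * MAJOR:default class or, failing that, in the direct queue. */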
187 #define HTB_DIRECT (struct htb_class*)-1
188 
189 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
190 				      int *qerr)
191 {
192 	struct htb_sched *q = qdisc_priv(sch);
193 	struct htb_class *cl;
194 	struct tcf_result res;
195 	struct tcf_proto *tcf;
196 	int result;
197 
198 	/* allow class selection by setting skb->priority to a valid classid;
199 	   note that nfmark can be used too by attaching an fw filter with no
200 	   rules in it */
201 	if (skb->priority == sch->handle)
202 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
203 	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
204 		return cl;
205 
206 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
207 	tcf = q->filter_list;
208 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
209 #ifdef CONFIG_NET_CLS_ACT
210 		switch (result) {
211 		case TC_ACT_QUEUED:
212 		case TC_ACT_STOLEN:
213 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
214 		case TC_ACT_SHOT:
215 			return NULL;
216 		}
217 #endif
218 		if ((cl = (void *)res.class) == NULL) {
219 			if (res.classid == sch->handle)
220 				return HTB_DIRECT;	/* X:0 (direct flow) */
221 			if ((cl = htb_find(res.classid, sch)) == NULL)
222 				break;	/* filter selected invalid classid */
223 		}
224 		if (!cl->level)
225 			return cl;	/* we hit leaf; return it */
226 
227 		/* we have got inner class; apply inner filter chain */
228 		tcf = cl->filter_list;
229 	}
230 	/* classification failed; try to use default class */
231 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
232 	if (!cl || cl->level)
233 		return HTB_DIRECT;	/* bad default .. this is safe bet */
234 	return cl;
235 }
236 
237 /**
238  * htb_add_to_id_tree - adds class to the round robin list
239  *
240  * Routine adds class to the list (actually tree) sorted by classid.
241  * Make sure that class is not already on such list for given prio.
242  */
243 static void htb_add_to_id_tree(struct rb_root *root,
244 			       struct htb_class *cl, int prio)
245 {
246 	struct rb_node **p = &root->rb_node, *parent = NULL;
247 
248 	while (*p) {
249 		struct htb_class *c;
250 		parent = *p;
251 		c = rb_entry(parent, struct htb_class, node[prio]);
252 
253 		if (cl->common.classid > c->common.classid)
254 			p = &parent->rb_right;
255 		else
256 			p = &parent->rb_left;
257 	}
258 	rb_link_node(&cl->node[prio], parent, p);
259 	rb_insert_color(&cl->node[prio], root);
260 }
261 
262 /**
263  * htb_add_to_wait_tree - adds class to the event queue with delay
264  *
265  * The class is added to the priority event queue to indicate that the class
266  * will change its mode at time cl->pq_key. Make sure that the class is not
267  * already in the queue.
268  */
269 static void htb_add_to_wait_tree(struct htb_sched *q,
270 				 struct htb_class *cl, long delay)
271 {
272 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
273 
274 	cl->pq_key = q->now + delay;
275 	if (cl->pq_key == q->now)
276 		cl->pq_key++;
277 
278 	/* update the nearest event cache */
279 	if (q->near_ev_cache[cl->level] > cl->pq_key)
280 		q->near_ev_cache[cl->level] = cl->pq_key;
281 
282 	while (*p) {
283 		struct htb_class *c;
284 		parent = *p;
285 		c = rb_entry(parent, struct htb_class, pq_node);
286 		if (cl->pq_key >= c->pq_key)
287 			p = &parent->rb_right;
288 		else
289 			p = &parent->rb_left;
290 	}
291 	rb_link_node(&cl->pq_node, parent, p);
292 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
293 }
294 
295 /**
296  * htb_next_rb_node - finds next node in binary tree
297  *
298  * When we are past the last key, *n is set to NULL.
299  * Average complexity is 2 steps per call.
300  */
301 static inline void htb_next_rb_node(struct rb_node **n)
302 {
303 	*n = rb_next(*n);
304 }
305 
306 /**
307  * htb_add_class_to_row - add class to its row
308  *
309  * The class is added to row at priorities marked in mask.
310  * It does nothing if mask == 0.
311  */
312 static inline void htb_add_class_to_row(struct htb_sched *q,
313 					struct htb_class *cl, int mask)
314 {
315 	q->row_mask[cl->level] |= mask;
316 	while (mask) {
317 		int prio = ffz(~mask);
318 		mask &= ~(1 << prio);
319 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
320 	}
321 }
322 
323 /* If this triggers, it is a bug in this code, but it need not be fatal */
324 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
325 {
326 	if (RB_EMPTY_NODE(rb)) {
327 		WARN_ON(1);
328 	} else {
329 		rb_erase(rb, root);
330 		RB_CLEAR_NODE(rb);
331 	}
332 }
333 
334 
335 /**
336  * htb_remove_class_from_row - removes class from its row
337  *
338  * The class is removed from row at priorities marked in mask.
339  * It does nothing if mask == 0.
340  */
341 static inline void htb_remove_class_from_row(struct htb_sched *q,
342 						 struct htb_class *cl, int mask)
343 {
344 	int m = 0;
345 
346 	while (mask) {
347 		int prio = ffz(~mask);
348 
349 		mask &= ~(1 << prio);
350 		if (q->ptr[cl->level][prio] == cl->node + prio)
351 			htb_next_rb_node(q->ptr[cl->level] + prio);
352 
353 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
354 		if (!q->row[cl->level][prio].rb_node)
355 			m |= 1 << prio;
356 	}
357 	q->row_mask[cl->level] &= ~m;
358 }
359 
360 /**
361  * htb_activate_prios - creates the active class's feed chain
362  *
363  * The class is connected to its ancestors and/or the appropriate rows
364  * for the priorities it is participating in. cl->cmode must be the new
365  * (activated) mode. It does nothing if cl->prio_activity == 0.
366  */
367 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
368 {
369 	struct htb_class *p = cl->parent;
370 	long m, mask = cl->prio_activity;
371 
372 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
373 		m = mask;
374 		while (m) {
375 			int prio = ffz(~m);
376 			m &= ~(1 << prio);
377 
378 			if (p->un.inner.feed[prio].rb_node)
379 				/* parent already has its feed in use, so
380 				   reset the bit in mask as parent is already ok */
381 				mask &= ~(1 << prio);
382 
383 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
384 		}
385 		p->prio_activity |= mask;
386 		cl = p;
387 		p = cl->parent;
388 
389 	}
390 	if (cl->cmode == HTB_CAN_SEND && mask)
391 		htb_add_class_to_row(q, cl, mask);
392 }
393 
394 /**
395  * htb_deactivate_prios - remove class from feed chain
396  *
397  * cl->cmode must represent the old mode (before deactivation). It does
398  * nothing if cl->prio_activity == 0. The class is removed from all feed
399  * chains and rows.
400  */
401 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
402 {
403 	struct htb_class *p = cl->parent;
404 	long m, mask = cl->prio_activity;
405 
406 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
407 		m = mask;
408 		mask = 0;
409 		while (m) {
410 			int prio = ffz(~m);
411 			m &= ~(1 << prio);
412 
413 			if (p->un.inner.ptr[prio] == cl->node + prio) {
414 				/* we are removing a child which is pointed to from
415 				   the parent feed - forget the pointer but remember
416 				   the classid */
417 				p->un.inner.last_ptr_id[prio] = cl->common.classid;
418 				p->un.inner.ptr[prio] = NULL;
419 			}
420 
421 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
422 
423 			if (!p->un.inner.feed[prio].rb_node)
424 				mask |= 1 << prio;
425 		}
426 
427 		p->prio_activity &= ~mask;
428 		cl = p;
429 		p = cl->parent;
430 
431 	}
432 	if (cl->cmode == HTB_CAN_SEND && mask)
433 		htb_remove_class_from_row(q, cl, mask);
434 }
435 
436 static inline long htb_lowater(const struct htb_class *cl)
437 {
438 	if (htb_hysteresis)
439 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
440 	else
441 		return 0;
442 }
443 static inline long htb_hiwater(const struct htb_class *cl)
444 {
445 	if (htb_hysteresis)
446 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
447 	else
448 		return 0;
449 }
450 
451 
452 /**
453  * htb_class_mode - computes and returns current class mode
454  *
455  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
456  * is not HTB_CAN_SEND then *diff is updated to the time difference from
457  * now to the time when cl will change its state.
458  * Also it is worth noting that the class mode doesn't change simply
459  * at cl->{c,}tokens == 0; rather there can be hysteresis over the
460  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
461  * mode transitions per time unit. The speed gain is about 1/6.
462  */
463 static inline enum htb_cmode
464 htb_class_mode(struct htb_class *cl, long *diff)
465 {
466 	long toks;
467 
468 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
469 		*diff = -toks;
470 		return HTB_CANT_SEND;
471 	}
472 
473 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
474 		return HTB_CAN_SEND;
475 
476 	*diff = -toks;
477 	return HTB_MAY_BORROW;
478 }
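
/* Worked example (illustrative numbers, htb_hysteresis assumed 0 so both
 * watermarks are 0): if cl->ctokens + *diff == -5000, the ceil bucket is
 * empty, *diff becomes 5000 and the class is HTB_CANT_SEND; if the ceil
 * bucket is fine but cl->tokens + *diff == -2000, *diff becomes 2000 and the
 * class may only borrow (HTB_MAY_BORROW); otherwise it is HTB_CAN_SEND.
 * With htb_hysteresis=1, a class that is not yet HTB_CANT_SEND only drops
 * into it below -cl->cbuffer, and a class that is already HTB_CAN_SEND keeps
 * that mode down to -cl->buffer, so small oscillations around zero do not
 * flip the mode on every packet. */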
479 
480 /**
481  * htb_change_class_mode - changes the class's mode
482  *
483  * This should be the only way to change the class's mode under normal
484  * circumstances. The routine will update the feed list linkage, change the
485  * mode and add the class to the wait event queue if appropriate. The new
486  * mode should be different from the old one and cl->pq_key has to be valid
487  * if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
488  */
489 static void
490 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
491 {
492 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
493 
494 	if (new_mode == cl->cmode)
495 		return;
496 
497 	if (cl->prio_activity) {	/* not necessary: speed optimization */
498 		if (cl->cmode != HTB_CANT_SEND)
499 			htb_deactivate_prios(q, cl);
500 		cl->cmode = new_mode;
501 		if (new_mode != HTB_CANT_SEND)
502 			htb_activate_prios(q, cl);
503 	} else
504 		cl->cmode = new_mode;
505 }
506 
507 /**
508  * htb_activate - inserts leaf cl into appropriate active feeds
509  *
510  * The routine learns the (new) priority of the leaf and activates the feed
511  * chain for that prio. It can safely be called on an already active leaf.
512  * It also adds the leaf to the drop list.
513  */
514 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
515 {
516 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
517 
518 	if (!cl->prio_activity) {
519 		cl->prio_activity = 1 << cl->prio;
520 		htb_activate_prios(q, cl);
521 		list_add_tail(&cl->un.leaf.drop_list,
522 			      q->drops + cl->prio);
523 	}
524 }
525 
526 /**
527  * htb_deactivate - remove leaf cl from active feeds
528  *
529  * Make sure that the leaf is active. In other words, it can't be called
530  * with a non-active leaf. It also removes the class from the drop list.
531  */
532 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
533 {
534 	WARN_ON(!cl->prio_activity);
535 
536 	htb_deactivate_prios(q, cl);
537 	cl->prio_activity = 0;
538 	list_del_init(&cl->un.leaf.drop_list);
539 }
540 
541 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
542 {
543 	int uninitialized_var(ret);
544 	struct htb_sched *q = qdisc_priv(sch);
545 	struct htb_class *cl = htb_classify(skb, sch, &ret);
546 
547 	if (cl == HTB_DIRECT) {
548 		/* enqueue to helper queue */
549 		if (q->direct_queue.qlen < q->direct_qlen) {
550 			__skb_queue_tail(&q->direct_queue, skb);
551 			q->direct_pkts++;
552 		} else {
553 			kfree_skb(skb);
554 			sch->qstats.drops++;
555 			return NET_XMIT_DROP;
556 		}
557 #ifdef CONFIG_NET_CLS_ACT
558 	} else if (!cl) {
559 		if (ret & __NET_XMIT_BYPASS)
560 			sch->qstats.drops++;
561 		kfree_skb(skb);
562 		return ret;
563 #endif
564 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
565 		if (net_xmit_drop_count(ret)) {
566 			sch->qstats.drops++;
567 			cl->qstats.drops++;
568 		}
569 		return ret;
570 	} else {
571 		cl->bstats.packets +=
572 			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
573 		cl->bstats.bytes += qdisc_pkt_len(skb);
574 		htb_activate(q, cl);
575 	}
576 
577 	sch->q.qlen++;
578 	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
579 	sch->bstats.bytes += qdisc_pkt_len(skb);
580 	return NET_XMIT_SUCCESS;
581 }
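
/* Note on the statistics above (not part of the original source): a GSO skb
 * is counted once per segment, e.g. an skb with gso_segs == 3 adds 3 to
 * bstats.packets, while qdisc_pkt_len(skb) already covers the bytes of all
 * three segments, so bstats.bytes is charged only once per skb. */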
582 
583 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
584 {
585 	long toks = diff + cl->tokens;
586 
587 	if (toks > cl->buffer)
588 		toks = cl->buffer;
589 	toks -= (long) qdisc_l2t(cl->rate, bytes);
590 	if (toks <= -cl->mbuffer)
591 		toks = 1 - cl->mbuffer;
592 
593 	cl->tokens = toks;
594 }
595 
596 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
597 {
598 	long toks = diff + cl->ctokens;
599 
600 	if (toks > cl->cbuffer)
601 		toks = cl->cbuffer;
602 	toks -= (long) qdisc_l2t(cl->ceil, bytes);
603 	if (toks <= -cl->mbuffer)
604 		toks = 1 - cl->mbuffer;
605 
606 	cl->ctokens = toks;
607 }
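
/* Worked example (illustrative numbers, not part of the original source):
 * suppose the rate table says a 1500 byte packet costs 1200 ticks at the
 * class rate and the class has been idle long enough that diff + cl->tokens
 * exceeds cl->buffer. Then htb_accnt_tokens() first clamps the credit to
 * cl->buffer (an idle class can never bank more than one burst), subtracts
 * the 1200 ticks for the packet, and finally bounds the result from below by
 * 1 - cl->mbuffer so a long backlog cannot drive the bucket arbitrarily
 * negative. htb_accnt_ctokens() does the same against the ceil rate and
 * cl->cbuffer. */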
608 
609 /**
610  * htb_charge_class - charges amount "bytes" to leaf and ancestors
611  *
612  * Routine assumes that packet "bytes" long was dequeued from leaf cl
613  * borrowing from "level". It accounts bytes to ceil leaky bucket for
614  * leaf and all ancestors and to rate bucket for ancestors at levels
615  * "level" and higher. It also handles possible change of mode resulting
616  * from the update. Note that mode can also increase here (MAY_BORROW to
617  * CAN_SEND) because we can use more precise clock that event queue here.
618  * In such case we remove class from event queue first.
619  */
620 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
621 			     int level, struct sk_buff *skb)
622 {
623 	int bytes = qdisc_pkt_len(skb);
624 	enum htb_cmode old_mode;
625 	long diff;
626 
627 	while (cl) {
628 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
629 		if (cl->level >= level) {
630 			if (cl->level == level)
631 				cl->xstats.lends++;
632 			htb_accnt_tokens(cl, bytes, diff);
633 		} else {
634 			cl->xstats.borrows++;
635 			cl->tokens += diff;	/* we moved t_c; update tokens */
636 		}
637 		htb_accnt_ctokens(cl, bytes, diff);
638 		cl->t_c = q->now;
639 
640 		old_mode = cl->cmode;
641 		diff = 0;
642 		htb_change_class_mode(q, cl, &diff);
643 		if (old_mode != cl->cmode) {
644 			if (old_mode != HTB_CAN_SEND)
645 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
646 			if (cl->cmode != HTB_CAN_SEND)
647 				htb_add_to_wait_tree(q, cl, diff);
648 		}
649 
650 		/* update byte stats except for leaves which are already updated */
651 		if (cl->level) {
652 			cl->bstats.bytes += bytes;
653 			cl->bstats.packets += skb_is_gso(skb)?
654 					skb_shinfo(skb)->gso_segs:1;
655 		}
656 		cl = cl->parent;
657 	}
658 }
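
/* Illustrative walk (assumed topology, not part of the original source):
 * take leaf 1:100 at level 0 with parent 1:10 (level 6) and root 1:1
 * (level 7), and a packet dequeued while borrowing from level 7. The leaf
 * and 1:10 sit below the borrowing level, so they only get their rate tokens
 * refreshed (tokens += diff) and are charged against their ceil bucket;
 * 1:1, at the borrowing level, is additionally charged against its rate
 * bucket and counts a "lend" in its xstats, while the two borrowers each
 * count a "borrow". */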
659 
660 /**
661  * htb_do_events - make mode changes to classes at the level
662  *
663  * Scans event queue for pending events and applies them. Returns time of
664  * next pending event (0 for no event in pq, q->now for too many events).
665  * Note: Applied are events whose have cl->pq_key <= q->now.
666  */
667 static psched_time_t htb_do_events(struct htb_sched *q, int level,
668 				   unsigned long start)
669 {
670 	/* don't run for longer than 2 jiffies; 2 is used instead of
671 	   1 to simplify things when jiffy is going to be incremented
672 	   too soon */
673 	unsigned long stop_at = start + 2;
674 	while (time_before(jiffies, stop_at)) {
675 		struct htb_class *cl;
676 		long diff;
677 		struct rb_node *p = rb_first(&q->wait_pq[level]);
678 
679 		if (!p)
680 			return 0;
681 
682 		cl = rb_entry(p, struct htb_class, pq_node);
683 		if (cl->pq_key > q->now)
684 			return cl->pq_key;
685 
686 		htb_safe_rb_erase(p, q->wait_pq + level);
687 		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
688 		htb_change_class_mode(q, cl, &diff);
689 		if (cl->cmode != HTB_CAN_SEND)
690 			htb_add_to_wait_tree(q, cl, diff);
691 	}
692 
693 	/* too much load - let's continue after a break for scheduling */
694 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
695 		printk(KERN_WARNING "htb: too many events!\n");
696 		q->warned |= HTB_WARN_TOOMANYEVENTS;
697 	}
698 
699 	return q->now;
700 }
701 
702 /* Returns class->node+prio from the id-tree where the class's id is >= id.
703    NULL if no such one exists. */
704 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
705 					      u32 id)
706 {
707 	struct rb_node *r = NULL;
708 	while (n) {
709 		struct htb_class *cl =
710 		    rb_entry(n, struct htb_class, node[prio]);
711 
712 		if (id > cl->common.classid) {
713 			n = n->rb_right;
714 		} else if (id < cl->common.classid) {
715 			r = n;
716 			n = n->rb_left;
717 		} else {
718 			return n;
719 		}
720 	}
721 	return r;
722 }
723 
724 /**
725  * htb_lookup_leaf - returns next leaf class in DRR order
726  *
727  * Find the leaf that the current feed pointer points to.
728  */
729 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
730 					 struct rb_node **pptr, u32 * pid)
731 {
732 	int i;
733 	struct {
734 		struct rb_node *root;
735 		struct rb_node **pptr;
736 		u32 *pid;
737 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
738 
739 	BUG_ON(!tree->rb_node);
740 	sp->root = tree->rb_node;
741 	sp->pptr = pptr;
742 	sp->pid = pid;
743 
744 	for (i = 0; i < 65535; i++) {
745 		if (!*sp->pptr && *sp->pid) {
746 			/* ptr was invalidated but id is valid - try to recover
747 			   the original or next ptr */
748 			*sp->pptr =
749 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
750 		}
751 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
752 				   can become out of date quickly */
753 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
754 			*sp->pptr = sp->root;
755 			while ((*sp->pptr)->rb_left)
756 				*sp->pptr = (*sp->pptr)->rb_left;
757 			if (sp > stk) {
758 				sp--;
759 				if (!*sp->pptr) {
760 					WARN_ON(1);
761 					return NULL;
762 				}
763 				htb_next_rb_node(sp->pptr);
764 			}
765 		} else {
766 			struct htb_class *cl;
767 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
768 			if (!cl->level)
769 				return cl;
770 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
771 			sp->pptr = cl->un.inner.ptr + prio;
772 			sp->pid = cl->un.inner.last_ptr_id + prio;
773 		}
774 	}
775 	WARN_ON(1);
776 	return NULL;
777 }
778 
779 /* dequeues packet at given priority and level; call only if
780    you are sure that there is an active class at prio/level */
781 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
782 					int level)
783 {
784 	struct sk_buff *skb = NULL;
785 	struct htb_class *cl, *start;
786 	/* look initial class up in the row */
787 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
788 				     q->ptr[level] + prio,
789 				     q->last_ptr_id[level] + prio);
790 
791 	do {
792 next:
793 		if (unlikely(!cl))
794 			return NULL;
795 
796 		/* the class can be empty - it is unlikely but can be true if the
797 		   leaf qdisc drops packets in its enqueue routine or if someone
798 		   used the graft operation on the leaf since the last dequeue;
799 		   simply deactivate and skip such a class */
800 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
801 			struct htb_class *next;
802 			htb_deactivate(q, cl);
803 
804 			/* row/level might become empty */
805 			if ((q->row_mask[level] & (1 << prio)) == 0)
806 				return NULL;
807 
808 			next = htb_lookup_leaf(q->row[level] + prio,
809 					       prio, q->ptr[level] + prio,
810 					       q->last_ptr_id[level] + prio);
811 
812 			if (cl == start)	/* fix start if we just deleted it */
813 				start = next;
814 			cl = next;
815 			goto next;
816 		}
817 
818 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
819 		if (likely(skb != NULL))
820 			break;
821 
822 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
823 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
824 				  ptr[0]) + prio);
825 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
826 				     q->ptr[level] + prio,
827 				     q->last_ptr_id[level] + prio);
828 
829 	} while (cl != start);
830 
831 	if (likely(skb != NULL)) {
832 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
833 		if (cl->un.leaf.deficit[level] < 0) {
834 			cl->un.leaf.deficit[level] += cl->quantum;
835 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
836 					  ptr[0]) + prio);
837 		}
838 		/* this used to be after charge_class but this constellation
839 		   gives us slightly better performance */
840 		if (!cl->un.leaf.q->q.qlen)
841 			htb_deactivate(q, cl);
842 		htb_charge_class(q, cl, level, skb);
843 	}
844 	return skb;
845 }
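
/* Worked example of the deficit round robin step above (illustrative
 * numbers, not part of the original source): with cl->quantum == 1500, a
 * leaf that dequeues a 400 byte packet drops its deficit from 300 to -100,
 * so it receives another 1500 byte quantum and the feed/row pointer advances
 * to the next active leaf at that prio; a leaf whose deficit stays
 * non-negative keeps being served on the next lookup. */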
846 
847 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
848 {
849 	struct sk_buff *skb = NULL;
850 	struct htb_sched *q = qdisc_priv(sch);
851 	int level;
852 	psched_time_t next_event;
853 	unsigned long start_at;
854 
855 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
856 	skb = __skb_dequeue(&q->direct_queue);
857 	if (skb != NULL) {
858 		sch->flags &= ~TCQ_F_THROTTLED;
859 		sch->q.qlen--;
860 		return skb;
861 	}
862 
863 	if (!sch->q.qlen)
864 		goto fin;
865 	q->now = psched_get_time();
866 	start_at = jiffies;
867 
868 	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
869 
870 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
871 		/* common case optimization - skip event handler quickly */
872 		int m;
873 		psched_time_t event;
874 
875 		if (q->now >= q->near_ev_cache[level]) {
876 			event = htb_do_events(q, level, start_at);
877 			if (!event)
878 				event = q->now + PSCHED_TICKS_PER_SEC;
879 			q->near_ev_cache[level] = event;
880 		} else
881 			event = q->near_ev_cache[level];
882 
883 		if (next_event > event)
884 			next_event = event;
885 
886 		m = ~q->row_mask[level];
887 		while (m != (int)(-1)) {
888 			int prio = ffz(m);
889 			m |= 1 << prio;
890 			skb = htb_dequeue_tree(q, prio, level);
891 			if (likely(skb != NULL)) {
892 				sch->q.qlen--;
893 				sch->flags &= ~TCQ_F_THROTTLED;
894 				goto fin;
895 			}
896 		}
897 	}
898 	sch->qstats.overlimits++;
899 	if (likely(next_event > q->now))
900 		qdisc_watchdog_schedule(&q->watchdog, next_event);
901 	else
902 		schedule_work(&q->work);
903 fin:
904 	return skb;
905 }
906 
907 /* try to drop from each class (by prio) until one succeeds */
908 static unsigned int htb_drop(struct Qdisc *sch)
909 {
910 	struct htb_sched *q = qdisc_priv(sch);
911 	int prio;
912 
913 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
914 		struct list_head *p;
915 		list_for_each(p, q->drops + prio) {
916 			struct htb_class *cl = list_entry(p, struct htb_class,
917 							  un.leaf.drop_list);
918 			unsigned int len;
919 			if (cl->un.leaf.q->ops->drop &&
920 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
921 				sch->q.qlen--;
922 				if (!cl->un.leaf.q->q.qlen)
923 					htb_deactivate(q, cl);
924 				return len;
925 			}
926 		}
927 	}
928 	return 0;
929 }
930 
931 /* reset all classes */
932 /* always called under BH & queue lock */
933 static void htb_reset(struct Qdisc *sch)
934 {
935 	struct htb_sched *q = qdisc_priv(sch);
936 	struct htb_class *cl;
937 	struct hlist_node *n;
938 	unsigned int i;
939 
940 	for (i = 0; i < q->clhash.hashsize; i++) {
941 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
942 			if (cl->level)
943 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
944 			else {
945 				if (cl->un.leaf.q)
946 					qdisc_reset(cl->un.leaf.q);
947 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
948 			}
949 			cl->prio_activity = 0;
950 			cl->cmode = HTB_CAN_SEND;
951 
952 		}
953 	}
954 	qdisc_watchdog_cancel(&q->watchdog);
955 	__skb_queue_purge(&q->direct_queue);
956 	sch->q.qlen = 0;
957 	memset(q->row, 0, sizeof(q->row));
958 	memset(q->row_mask, 0, sizeof(q->row_mask));
959 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
960 	memset(q->ptr, 0, sizeof(q->ptr));
961 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
962 		INIT_LIST_HEAD(q->drops + i);
963 }
964 
965 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
966 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
967 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
968 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
969 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
970 };
971 
972 static void htb_work_func(struct work_struct *work)
973 {
974 	struct htb_sched *q = container_of(work, struct htb_sched, work);
975 	struct Qdisc *sch = q->watchdog.qdisc;
976 
977 	__netif_schedule(qdisc_root(sch));
978 }
979 
980 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
981 {
982 	struct htb_sched *q = qdisc_priv(sch);
983 	struct nlattr *tb[TCA_HTB_INIT + 1];
984 	struct tc_htb_glob *gopt;
985 	int err;
986 	int i;
987 
988 	if (!opt)
989 		return -EINVAL;
990 
991 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
992 	if (err < 0)
993 		return err;
994 
995 	if (tb[TCA_HTB_INIT] == NULL) {
996 		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
997 		return -EINVAL;
998 	}
999 	gopt = nla_data(tb[TCA_HTB_INIT]);
1000 	if (gopt->version != HTB_VER >> 16) {
1001 		printk(KERN_ERR
1002 		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1003 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1004 		return -EINVAL;
1005 	}
1006 
1007 	err = qdisc_class_hash_init(&q->clhash);
1008 	if (err < 0)
1009 		return err;
1010 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1011 		INIT_LIST_HEAD(q->drops + i);
1012 
1013 	qdisc_watchdog_init(&q->watchdog, sch);
1014 	INIT_WORK(&q->work, htb_work_func);
1015 	skb_queue_head_init(&q->direct_queue);
1016 
1017 	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1018 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1019 		q->direct_qlen = 2;
1020 
1021 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1022 		q->rate2quantum = 1;
1023 	q->defcls = gopt->defcls;
1024 
1025 	return 0;
1026 }
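
/* Typical userspace counterpart (assumed device name, not part of the
 * original source): a command along the lines of
 * "tc qdisc add dev eth0 root handle 1: htb default 20 r2q 10" arrives here
 * as TCA_HTB_INIT; gopt->rate2quantum becomes q->rate2quantum and
 * gopt->defcls becomes q->defcls, the minor of the class that htb_classify()
 * falls back to for unclassified traffic. */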
1027 
1028 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1029 {
1030 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1031 	struct htb_sched *q = qdisc_priv(sch);
1032 	struct nlattr *nest;
1033 	struct tc_htb_glob gopt;
1034 
1035 	spin_lock_bh(root_lock);
1036 
1037 	gopt.direct_pkts = q->direct_pkts;
1038 	gopt.version = HTB_VER;
1039 	gopt.rate2quantum = q->rate2quantum;
1040 	gopt.defcls = q->defcls;
1041 	gopt.debug = 0;
1042 
1043 	nest = nla_nest_start(skb, TCA_OPTIONS);
1044 	if (nest == NULL)
1045 		goto nla_put_failure;
1046 	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1047 	nla_nest_end(skb, nest);
1048 
1049 	spin_unlock_bh(root_lock);
1050 	return skb->len;
1051 
1052 nla_put_failure:
1053 	spin_unlock_bh(root_lock);
1054 	nla_nest_cancel(skb, nest);
1055 	return -1;
1056 }
1057 
1058 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1059 			  struct sk_buff *skb, struct tcmsg *tcm)
1060 {
1061 	struct htb_class *cl = (struct htb_class *)arg;
1062 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1063 	struct nlattr *nest;
1064 	struct tc_htb_opt opt;
1065 
1066 	spin_lock_bh(root_lock);
1067 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1068 	tcm->tcm_handle = cl->common.classid;
1069 	if (!cl->level && cl->un.leaf.q)
1070 		tcm->tcm_info = cl->un.leaf.q->handle;
1071 
1072 	nest = nla_nest_start(skb, TCA_OPTIONS);
1073 	if (nest == NULL)
1074 		goto nla_put_failure;
1075 
1076 	memset(&opt, 0, sizeof(opt));
1077 
1078 	opt.rate = cl->rate->rate;
1079 	opt.buffer = cl->buffer;
1080 	opt.ceil = cl->ceil->rate;
1081 	opt.cbuffer = cl->cbuffer;
1082 	opt.quantum = cl->quantum;
1083 	opt.prio = cl->prio;
1084 	opt.level = cl->level;
1085 	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1086 
1087 	nla_nest_end(skb, nest);
1088 	spin_unlock_bh(root_lock);
1089 	return skb->len;
1090 
1091 nla_put_failure:
1092 	spin_unlock_bh(root_lock);
1093 	nla_nest_cancel(skb, nest);
1094 	return -1;
1095 }
1096 
1097 static int
1098 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1099 {
1100 	struct htb_class *cl = (struct htb_class *)arg;
1101 
1102 	if (!cl->level && cl->un.leaf.q)
1103 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1104 	cl->xstats.tokens = cl->tokens;
1105 	cl->xstats.ctokens = cl->ctokens;
1106 
1107 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1108 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1109 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1110 		return -1;
1111 
1112 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1113 }
1114 
1115 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1116 		     struct Qdisc **old)
1117 {
1118 	struct htb_class *cl = (struct htb_class *)arg;
1119 
1120 	if (cl->level)
1121 		return -EINVAL;
1122 	if (new == NULL &&
1123 	    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1124 				     &pfifo_qdisc_ops,
1125 				     cl->common.classid)) == NULL)
1126 		return -ENOBUFS;
1127 
1128 	sch_tree_lock(sch);
1129 	*old = cl->un.leaf.q;
1130 	cl->un.leaf.q = new;
1131 	if (*old != NULL) {
1132 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1133 		qdisc_reset(*old);
1134 	}
1135 	sch_tree_unlock(sch);
1136 	return 0;
1137 }
1138 
1139 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1140 {
1141 	struct htb_class *cl = (struct htb_class *)arg;
1142 	return !cl->level ? cl->un.leaf.q : NULL;
1143 }
1144 
1145 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1146 {
1147 	struct htb_class *cl = (struct htb_class *)arg;
1148 
1149 	if (cl->un.leaf.q->q.qlen == 0)
1150 		htb_deactivate(qdisc_priv(sch), cl);
1151 }
1152 
1153 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1154 {
1155 	struct htb_class *cl = htb_find(classid, sch);
1156 	if (cl)
1157 		cl->refcnt++;
1158 	return (unsigned long)cl;
1159 }
1160 
1161 static inline int htb_parent_last_child(struct htb_class *cl)
1162 {
1163 	if (!cl->parent)
1164 		/* the root class */
1165 		return 0;
1166 	if (cl->parent->children > 1)
1167 		/* not the last child */
1168 		return 0;
1169 	return 1;
1170 }
1171 
1172 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1173 			       struct Qdisc *new_q)
1174 {
1175 	struct htb_class *parent = cl->parent;
1176 
1177 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1178 
1179 	if (parent->cmode != HTB_CAN_SEND)
1180 		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1181 
1182 	parent->level = 0;
1183 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1184 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1185 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1186 	parent->tokens = parent->buffer;
1187 	parent->ctokens = parent->cbuffer;
1188 	parent->t_c = psched_get_time();
1189 	parent->cmode = HTB_CAN_SEND;
1190 }
1191 
1192 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1193 {
1194 	if (!cl->level) {
1195 		WARN_ON(!cl->un.leaf.q);
1196 		qdisc_destroy(cl->un.leaf.q);
1197 	}
1198 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1199 	qdisc_put_rtab(cl->rate);
1200 	qdisc_put_rtab(cl->ceil);
1201 
1202 	tcf_destroy_chain(&cl->filter_list);
1203 	kfree(cl);
1204 }
1205 
1206 static void htb_destroy(struct Qdisc *sch)
1207 {
1208 	struct htb_sched *q = qdisc_priv(sch);
1209 	struct hlist_node *n, *next;
1210 	struct htb_class *cl;
1211 	unsigned int i;
1212 
1213 	cancel_work_sync(&q->work);
1214 	qdisc_watchdog_cancel(&q->watchdog);
1215 	/* This line used to be after htb_destroy_class call below
1216 	   and surprisingly it worked in 2.4. But it must precede it
1217 	   because a filter needs its target class alive to be able to call
1218 	   unbind_filter on it (without an Oops). */
1219 	tcf_destroy_chain(&q->filter_list);
1220 
1221 	for (i = 0; i < q->clhash.hashsize; i++) {
1222 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1223 			tcf_destroy_chain(&cl->filter_list);
1224 	}
1225 	for (i = 0; i < q->clhash.hashsize; i++) {
1226 		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1227 					  common.hnode)
1228 			htb_destroy_class(sch, cl);
1229 	}
1230 	qdisc_class_hash_destroy(&q->clhash);
1231 	__skb_queue_purge(&q->direct_queue);
1232 }
1233 
1234 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1235 {
1236 	struct htb_sched *q = qdisc_priv(sch);
1237 	struct htb_class *cl = (struct htb_class *)arg;
1238 	unsigned int qlen;
1239 	struct Qdisc *new_q = NULL;
1240 	int last_child = 0;
1241 
1242 	// TODO: why don't we allow deleting a subtree? references? does the
1243 	// tc subsys guarantee us that in htb_destroy it holds no class
1244 	// refs so that we can remove children safely there?
1245 	if (cl->children || cl->filter_cnt)
1246 		return -EBUSY;
1247 
1248 	if (!cl->level && htb_parent_last_child(cl)) {
1249 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1250 					  &pfifo_qdisc_ops,
1251 					  cl->parent->common.classid);
1252 		last_child = 1;
1253 	}
1254 
1255 	sch_tree_lock(sch);
1256 
1257 	if (!cl->level) {
1258 		qlen = cl->un.leaf.q->q.qlen;
1259 		qdisc_reset(cl->un.leaf.q);
1260 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1261 	}
1262 
1263 	/* delete from hash and active; remainder in destroy_class */
1264 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1265 	if (cl->parent)
1266 		cl->parent->children--;
1267 
1268 	if (cl->prio_activity)
1269 		htb_deactivate(q, cl);
1270 
1271 	if (cl->cmode != HTB_CAN_SEND)
1272 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1273 
1274 	if (last_child)
1275 		htb_parent_to_leaf(q, cl, new_q);
1276 
1277 	BUG_ON(--cl->refcnt == 0);
1278 	/*
1279 	 * This shouldn't happen: we "hold" one cops->get() when called
1280 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1281 	 */
1282 
1283 	sch_tree_unlock(sch);
1284 	return 0;
1285 }
1286 
1287 static void htb_put(struct Qdisc *sch, unsigned long arg)
1288 {
1289 	struct htb_class *cl = (struct htb_class *)arg;
1290 
1291 	if (--cl->refcnt == 0)
1292 		htb_destroy_class(sch, cl);
1293 }
1294 
1295 static int htb_change_class(struct Qdisc *sch, u32 classid,
1296 			    u32 parentid, struct nlattr **tca,
1297 			    unsigned long *arg)
1298 {
1299 	int err = -EINVAL;
1300 	struct htb_sched *q = qdisc_priv(sch);
1301 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1302 	struct nlattr *opt = tca[TCA_OPTIONS];
1303 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1304 	struct nlattr *tb[TCA_HTB_RTAB + 1];
1305 	struct tc_htb_opt *hopt;
1306 
1307 	/* extract all subattrs from opt attr */
1308 	if (!opt)
1309 		goto failure;
1310 
1311 	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1312 	if (err < 0)
1313 		goto failure;
1314 
1315 	err = -EINVAL;
1316 	if (tb[TCA_HTB_PARMS] == NULL)
1317 		goto failure;
1318 
1319 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1320 
1321 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1322 
1323 	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1324 	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1325 	if (!rtab || !ctab)
1326 		goto failure;
1327 
1328 	if (!cl) {		/* new class */
1329 		struct Qdisc *new_q;
1330 		int prio;
1331 		struct {
1332 			struct nlattr		nla;
1333 			struct gnet_estimator	opt;
1334 		} est = {
1335 			.nla = {
1336 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1337 				.nla_type	= TCA_RATE,
1338 			},
1339 			.opt = {
1340 				/* 4s interval, 16s averaging constant */
1341 				.interval	= 2,
1342 				.ewma_log	= 2,
1343 			},
1344 		};
1345 
1346 		/* check for valid classid */
1347 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1348 		    htb_find(classid, sch))
1349 			goto failure;
1350 
1351 		/* check maximal depth */
1352 		if (parent && parent->parent && parent->parent->level < 2) {
1353 			printk(KERN_ERR "htb: tree is too deep\n");
1354 			goto failure;
1355 		}
1356 		err = -ENOBUFS;
1357 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1358 			goto failure;
1359 
1360 		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1361 					qdisc_root_sleeping_lock(sch),
1362 					tca[TCA_RATE] ? : &est.nla);
1363 		if (err) {
1364 			kfree(cl);
1365 			goto failure;
1366 		}
1367 
1368 		cl->refcnt = 1;
1369 		cl->children = 0;
1370 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1371 		RB_CLEAR_NODE(&cl->pq_node);
1372 
1373 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1374 			RB_CLEAR_NODE(&cl->node[prio]);
1375 
1376 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1377 		   so it can't be used inside of sch_tree_lock
1378 		   -- thanks to Karlis Peisenieks */
1379 		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1380 					  &pfifo_qdisc_ops, classid);
1381 		sch_tree_lock(sch);
1382 		if (parent && !parent->level) {
1383 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1384 
1385 			/* turn parent into inner node */
1386 			qdisc_reset(parent->un.leaf.q);
1387 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1388 			qdisc_destroy(parent->un.leaf.q);
1389 			if (parent->prio_activity)
1390 				htb_deactivate(q, parent);
1391 
1392 			/* remove from evt list because of level change */
1393 			if (parent->cmode != HTB_CAN_SEND) {
1394 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1395 				parent->cmode = HTB_CAN_SEND;
1396 			}
1397 			parent->level = (parent->parent ? parent->parent->level
1398 					 : TC_HTB_MAXDEPTH) - 1;
1399 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1400 		}
1401 		/* leaf (we) needs elementary qdisc */
1402 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1403 
1404 		cl->common.classid = classid;
1405 		cl->parent = parent;
1406 
1407 		/* set class to be in HTB_CAN_SEND state */
1408 		cl->tokens = hopt->buffer;
1409 		cl->ctokens = hopt->cbuffer;
1410 		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1411 		cl->t_c = psched_get_time();
1412 		cl->cmode = HTB_CAN_SEND;
1413 
1414 		/* attach to the hash list and parent's family */
1415 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1416 		if (parent)
1417 			parent->children++;
1418 	} else {
1419 		if (tca[TCA_RATE]) {
1420 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1421 						    qdisc_root_sleeping_lock(sch),
1422 						    tca[TCA_RATE]);
1423 			if (err)
1424 				return err;
1425 		}
1426 		sch_tree_lock(sch);
1427 	}
1428 
1429 	/* there used to be a nasty bug here; we have to check that the node
1430 	   is really a leaf before changing cl->un.leaf ! */
1431 	if (!cl->level) {
1432 		cl->quantum = rtab->rate.rate / q->rate2quantum;
1433 		if (!hopt->quantum && cl->quantum < 1000) {
1434 			printk(KERN_WARNING
1435 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1436 			       cl->common.classid);
1437 			cl->quantum = 1000;
1438 		}
1439 		if (!hopt->quantum && cl->quantum > 200000) {
1440 			printk(KERN_WARNING
1441 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1442 			       cl->common.classid);
1443 			cl->quantum = 200000;
1444 		}
1445 		if (hopt->quantum)
1446 			cl->quantum = hopt->quantum;
1447 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1448 			cl->prio = TC_HTB_NUMPRIO - 1;
1449 	}
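
	/* Worked example of the r2q heuristic above (illustrative numbers,
	 * not part of the original source): with rate 800kbit the rate table
	 * carries 100000 bytes/s, so the default r2q of 10 yields a quantum
	 * of 10000 bytes; at 40kbit the same division gives 500, which trips
	 * the "is small" warning and is clamped to 1000 unless the user
	 * passed an explicit quantum. */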
1450 
1451 	cl->buffer = hopt->buffer;
1452 	cl->cbuffer = hopt->cbuffer;
1453 	if (cl->rate)
1454 		qdisc_put_rtab(cl->rate);
1455 	cl->rate = rtab;
1456 	if (cl->ceil)
1457 		qdisc_put_rtab(cl->ceil);
1458 	cl->ceil = ctab;
1459 	sch_tree_unlock(sch);
1460 
1461 	qdisc_class_hash_grow(sch, &q->clhash);
1462 
1463 	*arg = (unsigned long)cl;
1464 	return 0;
1465 
1466 failure:
1467 	if (rtab)
1468 		qdisc_put_rtab(rtab);
1469 	if (ctab)
1470 		qdisc_put_rtab(ctab);
1471 	return err;
1472 }
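
/* Typical userspace counterpart (assumed device and classids, not part of
 * the original source): "tc class add dev eth0 parent 1: classid 1:10 htb
 * rate 1mbit ceil 2mbit burst 15k prio 0" lands here with hopt carrying
 * rate/ceil/buffer/cbuffer/prio, and tc has already attached the matching
 * TCA_HTB_RTAB/TCA_HTB_CTAB rate tables that qdisc_get_rtab() resolves
 * above. */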
1473 
1474 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1475 {
1476 	struct htb_sched *q = qdisc_priv(sch);
1477 	struct htb_class *cl = (struct htb_class *)arg;
1478 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1479 
1480 	return fl;
1481 }
1482 
1483 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1484 				     u32 classid)
1485 {
1486 	struct htb_class *cl = htb_find(classid, sch);
1487 
1488 	/*if (cl && !cl->level) return 0;
1489 	   The line above used to be there to prevent attaching filters to
1490 	   leaves. But at least the tc_index filter uses this just to get the
1491 	   class for other reasons, so we have to allow for it.
1492 	   ----
1493 	   19.6.2002 As Werner explained, it is ok - bind filter is just
1494 	   another way to "lock" the class - unlike "get", this lock can
1495 	   be broken by the class during destroy, IIUC.
1496 	 */
1497 	if (cl)
1498 		cl->filter_cnt++;
1499 	return (unsigned long)cl;
1500 }
1501 
1502 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1503 {
1504 	struct htb_class *cl = (struct htb_class *)arg;
1505 
1506 	if (cl)
1507 		cl->filter_cnt--;
1508 }
1509 
1510 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1511 {
1512 	struct htb_sched *q = qdisc_priv(sch);
1513 	struct htb_class *cl;
1514 	struct hlist_node *n;
1515 	unsigned int i;
1516 
1517 	if (arg->stop)
1518 		return;
1519 
1520 	for (i = 0; i < q->clhash.hashsize; i++) {
1521 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1522 			if (arg->count < arg->skip) {
1523 				arg->count++;
1524 				continue;
1525 			}
1526 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1527 				arg->stop = 1;
1528 				return;
1529 			}
1530 			arg->count++;
1531 		}
1532 	}
1533 }
1534 
1535 static const struct Qdisc_class_ops htb_class_ops = {
1536 	.graft		=	htb_graft,
1537 	.leaf		=	htb_leaf,
1538 	.qlen_notify	=	htb_qlen_notify,
1539 	.get		=	htb_get,
1540 	.put		=	htb_put,
1541 	.change		=	htb_change_class,
1542 	.delete		=	htb_delete,
1543 	.walk		=	htb_walk,
1544 	.tcf_chain	=	htb_find_tcf,
1545 	.bind_tcf	=	htb_bind_filter,
1546 	.unbind_tcf	=	htb_unbind_filter,
1547 	.dump		=	htb_dump_class,
1548 	.dump_stats	=	htb_dump_class_stats,
1549 };
1550 
1551 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1552 	.next		=	NULL,
1553 	.cl_ops		=	&htb_class_ops,
1554 	.id		=	"htb",
1555 	.priv_size	=	sizeof(struct htb_sched),
1556 	.enqueue	=	htb_enqueue,
1557 	.dequeue	=	htb_dequeue,
1558 	.peek		=	qdisc_peek_dequeued,
1559 	.drop		=	htb_drop,
1560 	.init		=	htb_init,
1561 	.reset		=	htb_reset,
1562 	.destroy	=	htb_destroy,
1563 	.change		=	NULL /* htb_change */,
1564 	.dump		=	htb_dump,
1565 	.owner		=	THIS_MODULE,
1566 };
1567 
1568 static int __init htb_module_init(void)
1569 {
1570 	return register_qdisc(&htb_qdisc_ops);
1571 }
1572 static void __exit htb_module_exit(void)
1573 {
1574 	unregister_qdisc(&htb_qdisc_ops);
1575 }
1576 
1577 module_init(htb_module_init)
1578 module_exit(htb_module_exit)
1579 MODULE_LICENSE("GPL");
1580