xref: /openbmc/linux/net/sched/sch_htb.c (revision d0b73b48)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/pkt_sched.h>
42 
43 /* HTB algorithm.
44     Author: devik@cdi.cz
45     ========================================================================
46     HTB is like TBF with multiple classes. It is also similar to CBQ because
47     it allows assigning a priority to each class in the hierarchy.
48     In fact it is another implementation of Floyd's formal sharing.
49 
50     Levels:
51     Each class is assigned a level. Leaves always have level 0 and root
52     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
53     one less than their parent.
54 */
55 
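
/* Illustrative example (not part of the original source): with
 * TC_HTB_MAXDEPTH == 8, a hypothetical hierarchy
 *	1:1 (root) -> 1:10 (inner) -> 1:100 (leaf)
 * gives the root 1:1 level 7, the inner class 1:10 level 6 and the
 * leaf 1:100 level 0; see how parent->level is assigned in
 * htb_change_class() when a leaf is turned into an inner node.
 */
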
56 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
57 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
58 
59 #if HTB_VER >> 16 != TC_HTB_PROTOVER
60 #error "Mismatched sch_htb.c and pkt_sch.h"
61 #endif
62 
63 /* Module parameter and sysfs export */
64 module_param    (htb_hysteresis, int, 0640);
65 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
66 
67 /* used internally to keep the status of a single class */
68 enum htb_cmode {
69 	HTB_CANT_SEND,		/* class can't send and can't borrow */
70 	HTB_MAY_BORROW,		/* class can't send but may borrow */
71 	HTB_CAN_SEND		/* class can send */
72 };
73 
74 struct htb_rate_cfg {
75 	u64 rate_bps;
76 	u32 mult;
77 	u32 shift;
78 };
79 
80 /* interior & leaf nodes; props specific to leaves are marked L: */
81 struct htb_class {
82 	struct Qdisc_class_common common;
83 	/* general class parameters */
84 	struct gnet_stats_basic_packed bstats;
85 	struct gnet_stats_queue qstats;
86 	struct gnet_stats_rate_est rate_est;
87 	struct tc_htb_xstats xstats;	/* our special stats */
88 	int refcnt;		/* usage count of this class */
89 
90 	/* topology */
91 	int level;		/* our level (see above) */
92 	unsigned int children;
93 	struct htb_class *parent;	/* parent class */
94 
95 	int prio;		/* these two are used only by leaves... */
96 	int quantum;		/* but stored for parent-to-leaf return */
97 
98 	union {
99 		struct htb_class_leaf {
100 			struct Qdisc *q;
101 			int deficit[TC_HTB_MAXDEPTH];
102 			struct list_head drop_list;
103 		} leaf;
104 		struct htb_class_inner {
105 			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
106 			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
107 			/* When a class changes from state 1->2 and disconnects from
108 			 * its parent's feed then we lose the ptr value and start from the
109 			 * first child again. Here we store the classid of the
110 			 * last valid ptr (used when ptr is NULL).
111 			 */
112 			u32 last_ptr_id[TC_HTB_NUMPRIO];
113 		} inner;
114 	} un;
115 	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
116 	struct rb_node pq_node;	/* node for event queue */
117 	psched_time_t pq_key;
118 
119 	int prio_activity;	/* for which prios are we active */
120 	enum htb_cmode cmode;	/* current mode of the class */
121 
122 	/* class attached filters */
123 	struct tcf_proto *filter_list;
124 	int filter_cnt;
125 
126 	/* token bucket parameters */
127 	struct htb_rate_cfg rate;
128 	struct htb_rate_cfg ceil;
129 	s64 buffer, cbuffer;	/* token bucket depth/rate */
130 	psched_tdiff_t mbuffer;	/* max wait time */
131 	s64 tokens, ctokens;	/* current number of tokens */
132 	psched_time_t t_c;	/* checkpoint time */
133 };
134 
135 struct htb_sched {
136 	struct Qdisc_class_hash clhash;
137 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
138 
139 	/* self list - roots of self generating tree */
140 	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
141 	int row_mask[TC_HTB_MAXDEPTH];
142 	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
143 	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
144 
145 	/* self wait list - roots of wait PQs per row */
146 	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
147 
148 	/* time of nearest event per level (row) */
149 	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
150 
151 	int defcls;		/* class where unclassified flows go to */
152 
153 	/* filters for qdisc itself */
154 	struct tcf_proto *filter_list;
155 
156 	int rate2quantum;	/* quant = rate / rate2quantum */
157 	psched_time_t now;	/* cached dequeue time */
158 	struct qdisc_watchdog watchdog;
159 
160 	/* non shaped skbs; let them go directly thru */
161 	struct sk_buff_head direct_queue;
162 	int direct_qlen;	/* max qlen of above */
163 
164 	long direct_pkts;
165 
166 #define HTB_WARN_TOOMANYEVENTS	0x1
167 	unsigned int warned;	/* only one warning */
168 	struct work_struct work;
169 };
170 
171 static u64 l2t_ns(struct htb_rate_cfg *r, unsigned int len)
172 {
173 	return ((u64)len * r->mult) >> r->shift;
174 }
175 
176 static void htb_precompute_ratedata(struct htb_rate_cfg *r)
177 {
178 	u64 factor;
179 	u64 mult;
180 	int shift;
181 
182 	r->shift = 0;
183 	r->mult = 1;
184 	/*
185 	 * Calibrate mult, shift so that token counting is accurate
186 	 * for smallest packet size (64 bytes).  Token (time in ns) is
187 	 * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps.  It will
188 	 * work as long as the smallest packet transfer time can be
189 	 * accurately represented in nanosec.
190 	 */
191 	if (r->rate_bps > 0) {
192 		/*
193 		 * Higher shift gives better accuracy.  Find the largest
194 		 * shift such that mult fits in 32 bits.
195 		 */
196 		for (shift = 0; shift < 16; shift++) {
197 			r->shift = shift;
198 			factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
199 			mult = div64_u64(factor, r->rate_bps);
200 			if (mult > UINT_MAX)
201 				break;
202 		}
203 
204 		r->shift = shift - 1;
205 		factor = 8LLU * NSEC_PER_SEC * (1 << r->shift);
206 		r->mult = div64_u64(factor, r->rate_bps);
207 	}
208 }
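
/* Worked example (illustrative numbers, not from the original source):
 * for rate_bps = 100 Mbit/s the loop above never overflows 32 bits for
 * shift < 16, so we end with r->shift = 15 and
 * r->mult = div64_u64(8 * NSEC_PER_SEC * 32768, 100000000) = 2621440.
 * l2t_ns() then charges (1500 * 2621440) >> 15 = 120000 ns for a
 * 1500 byte packet and 5120 ns for a 64 byte one, matching the exact
 * transmission times at 100 Mbit/s.
 */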
209 
210 /* find class in global hash table using given handle */
211 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
212 {
213 	struct htb_sched *q = qdisc_priv(sch);
214 	struct Qdisc_class_common *clc;
215 
216 	clc = qdisc_class_find(&q->clhash, handle);
217 	if (clc == NULL)
218 		return NULL;
219 	return container_of(clc, struct htb_class, common);
220 }
221 
222 /**
223  * htb_classify - classify a packet into class
224  *
225  * It returns NULL if the packet should be dropped or -1 if the packet
226  * should be passed directly thru. In all other cases a leaf class is returned.
227  * We allow direct class selection by classid in skb->priority. Then we examine
228  * filters in the qdisc and in inner nodes (if a higher filter points to the inner
229  * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
230  * internal fifo (direct). These packets then go directly thru. If we still
231  * have no valid leaf we try to use the MAJOR:default leaf. If that is still
232  * unsuccessful we finish and return the direct queue.
233  */
234 #define HTB_DIRECT ((struct htb_class *)-1L)
235 
236 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
237 				      int *qerr)
238 {
239 	struct htb_sched *q = qdisc_priv(sch);
240 	struct htb_class *cl;
241 	struct tcf_result res;
242 	struct tcf_proto *tcf;
243 	int result;
244 
245 	/* allow selecting the class by setting skb->priority to a valid classid;
246 	 * note that nfmark can be used too by attaching an fw filter with no
247 	 * rules in it
248 	 */
249 	if (skb->priority == sch->handle)
250 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
251 	cl = htb_find(skb->priority, sch);
252 	if (cl && cl->level == 0)
253 		return cl;
254 
255 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
256 	tcf = q->filter_list;
257 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
258 #ifdef CONFIG_NET_CLS_ACT
259 		switch (result) {
260 		case TC_ACT_QUEUED:
261 		case TC_ACT_STOLEN:
262 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
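			/* fall through */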
263 		case TC_ACT_SHOT:
264 			return NULL;
265 		}
266 #endif
267 		cl = (void *)res.class;
268 		if (!cl) {
269 			if (res.classid == sch->handle)
270 				return HTB_DIRECT;	/* X:0 (direct flow) */
271 			cl = htb_find(res.classid, sch);
272 			if (!cl)
273 				break;	/* filter selected invalid classid */
274 		}
275 		if (!cl->level)
276 			return cl;	/* we hit leaf; return it */
277 
278 		/* we have got inner class; apply inner filter chain */
279 		tcf = cl->filter_list;
280 	}
281 	/* classification failed; try to use default class */
282 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
283 	if (!cl || cl->level)
284 		return HTB_DIRECT;	/* bad default .. this is safe bet */
285 	return cl;
286 }
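
/* Illustrative walk-through (hypothetical handles, not from the source):
 * with sch->handle "1:" and q->defcls == 0x20, a packet whose
 * skb->priority equals classid 1:0 goes to HTB_DIRECT, one whose
 * priority names an existing leaf such as 1:10 is queued to that leaf,
 * and a packet matching no filter falls back to leaf 1:20; if 1:20 does
 * not exist or is an inner class, it goes to HTB_DIRECT as well.
 */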
287 
288 /**
289  * htb_add_to_id_tree - adds class to the round robin list
290  *
291  * Routine adds the class to the list (actually a tree) sorted by classid.
292  * Make sure that the class is not already on such a list for the given prio.
293  */
294 static void htb_add_to_id_tree(struct rb_root *root,
295 			       struct htb_class *cl, int prio)
296 {
297 	struct rb_node **p = &root->rb_node, *parent = NULL;
298 
299 	while (*p) {
300 		struct htb_class *c;
301 		parent = *p;
302 		c = rb_entry(parent, struct htb_class, node[prio]);
303 
304 		if (cl->common.classid > c->common.classid)
305 			p = &parent->rb_right;
306 		else
307 			p = &parent->rb_left;
308 	}
309 	rb_link_node(&cl->node[prio], parent, p);
310 	rb_insert_color(&cl->node[prio], root);
311 }
312 
313 /**
314  * htb_add_to_wait_tree - adds class to the event queue with delay
315  *
316  * The class is added to the priority event queue to indicate that the class
317  * will change its mode at time cl->pq_key. Make sure that the class is not
318  * already in the queue.
319  */
320 static void htb_add_to_wait_tree(struct htb_sched *q,
321 				 struct htb_class *cl, s64 delay)
322 {
323 	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
324 
325 	cl->pq_key = q->now + delay;
326 	if (cl->pq_key == q->now)
327 		cl->pq_key++;
328 
329 	/* update the nearest event cache */
330 	if (q->near_ev_cache[cl->level] > cl->pq_key)
331 		q->near_ev_cache[cl->level] = cl->pq_key;
332 
333 	while (*p) {
334 		struct htb_class *c;
335 		parent = *p;
336 		c = rb_entry(parent, struct htb_class, pq_node);
337 		if (cl->pq_key >= c->pq_key)
338 			p = &parent->rb_right;
339 		else
340 			p = &parent->rb_left;
341 	}
342 	rb_link_node(&cl->pq_node, parent, p);
343 	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
344 }
345 
346 /**
347  * htb_next_rb_node - finds next node in binary tree
348  *
349  * When we are past last key we return NULL.
350  * Average complexity is 2 steps per call.
351  */
352 static inline void htb_next_rb_node(struct rb_node **n)
353 {
354 	*n = rb_next(*n);
355 }
356 
357 /**
358  * htb_add_class_to_row - add class to its row
359  *
360  * The class is added to row at priorities marked in mask.
361  * It does nothing if mask == 0.
362  */
363 static inline void htb_add_class_to_row(struct htb_sched *q,
364 					struct htb_class *cl, int mask)
365 {
366 	q->row_mask[cl->level] |= mask;
367 	while (mask) {
368 		int prio = ffz(~mask);
369 		mask &= ~(1 << prio);
370 		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
371 	}
372 }
373 
374 /* If this triggers, it is a bug in this code, but it need not be fatal */
375 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
376 {
377 	if (RB_EMPTY_NODE(rb)) {
378 		WARN_ON(1);
379 	} else {
380 		rb_erase(rb, root);
381 		RB_CLEAR_NODE(rb);
382 	}
383 }
384 
385 
386 /**
387  * htb_remove_class_from_row - removes class from its row
388  *
389  * The class is removed from row at priorities marked in mask.
390  * It does nothing if mask == 0.
391  */
392 static inline void htb_remove_class_from_row(struct htb_sched *q,
393 						 struct htb_class *cl, int mask)
394 {
395 	int m = 0;
396 
397 	while (mask) {
398 		int prio = ffz(~mask);
399 
400 		mask &= ~(1 << prio);
401 		if (q->ptr[cl->level][prio] == cl->node + prio)
402 			htb_next_rb_node(q->ptr[cl->level] + prio);
403 
404 		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
405 		if (!q->row[cl->level][prio].rb_node)
406 			m |= 1 << prio;
407 	}
408 	q->row_mask[cl->level] &= ~m;
409 }
410 
411 /**
412  * htb_activate_prios - creates the active class's feed chain
413  *
414  * The class is connected to ancestors and/or appropriate rows
415  * for priorities it is participating on. cl->cmode must be new
416  * (activated) mode. It does nothing if cl->prio_activity == 0.
417  */
418 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
419 {
420 	struct htb_class *p = cl->parent;
421 	long m, mask = cl->prio_activity;
422 
423 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
424 		m = mask;
425 		while (m) {
426 			int prio = ffz(~m);
427 			m &= ~(1 << prio);
428 
429 			if (p->un.inner.feed[prio].rb_node)
430 				/* parent already has its feed in use, so
431 				 * reset the bit in mask as the parent is already OK
432 				 */
433 				mask &= ~(1 << prio);
434 
435 			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
436 		}
437 		p->prio_activity |= mask;
438 		cl = p;
439 		p = cl->parent;
440 
441 	}
442 	if (cl->cmode == HTB_CAN_SEND && mask)
443 		htb_add_class_to_row(q, cl, mask);
444 }
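
/* Example (illustrative): activating a leaf at prio 2 (mask 0x4) while it
 * is in HTB_MAY_BORROW mode links it into its parent's feed[2] tree.  The
 * bit keeps propagating upward only while each parent's feed[2] was empty
 * before the insert; the climb stops at the first ancestor that is not in
 * HTB_MAY_BORROW mode, and if that ancestor is in HTB_CAN_SEND mode it is
 * added to its row by htb_add_class_to_row().
 */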
445 
446 /**
447  * htb_deactivate_prios - remove class from feed chain
448  *
449  * cl->cmode must represent old mode (before deactivation). It does
450  * nothing if cl->prio_activity == 0. Class is removed from all feed
451  * chains and rows.
452  */
453 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
454 {
455 	struct htb_class *p = cl->parent;
456 	long m, mask = cl->prio_activity;
457 
458 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
459 		m = mask;
460 		mask = 0;
461 		while (m) {
462 			int prio = ffz(~m);
463 			m &= ~(1 << prio);
464 
465 			if (p->un.inner.ptr[prio] == cl->node + prio) {
466 				/* we are removing a child which is pointed to from the
467 				 * parent feed - forget the pointer but remember the
468 				 * classid
469 				 */
470 				p->un.inner.last_ptr_id[prio] = cl->common.classid;
471 				p->un.inner.ptr[prio] = NULL;
472 			}
473 
474 			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
475 
476 			if (!p->un.inner.feed[prio].rb_node)
477 				mask |= 1 << prio;
478 		}
479 
480 		p->prio_activity &= ~mask;
481 		cl = p;
482 		p = cl->parent;
483 
484 	}
485 	if (cl->cmode == HTB_CAN_SEND && mask)
486 		htb_remove_class_from_row(q, cl, mask);
487 }
488 
489 static inline s64 htb_lowater(const struct htb_class *cl)
490 {
491 	if (htb_hysteresis)
492 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
493 	else
494 		return 0;
495 }
496 static inline s64 htb_hiwater(const struct htb_class *cl)
497 {
498 	if (htb_hysteresis)
499 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
500 	else
501 		return 0;
502 }
503 
504 
505 /**
506  * htb_class_mode - computes and returns current class mode
507  *
508  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
509  * is not HTB_CAN_SEND then cl->pq_key is updated to the time difference
510  * from now to the time when cl will change its state.
511  * Also it is worth noting that the class mode doesn't change simply
512  * at cl->{c,}tokens == 0 but rather there can be a hysteresis in the
513  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
514  * mode transitions per time unit. The speed gain is about 1/6.
515  */
516 static inline enum htb_cmode
517 htb_class_mode(struct htb_class *cl, s64 *diff)
518 {
519 	s64 toks;
520 
521 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
522 		*diff = -toks;
523 		return HTB_CANT_SEND;
524 	}
525 
526 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
527 		return HTB_CAN_SEND;
528 
529 	*diff = -toks;
530 	return HTB_MAY_BORROW;
531 }
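
/* Worked example (illustrative, hysteresis off so both watermarks are 0):
 * with cl->tokens == -5000, cl->ctokens == 20000 and *diff == 1000 (ns),
 * ctokens + diff == 21000 is not below the low watermark, but
 * tokens + diff == -4000 is below the high watermark, so the result is
 * HTB_MAY_BORROW and *diff is set to 4000, the time until the rate
 * bucket reaches zero again.
 */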
532 
533 /**
534  * htb_change_class_mode - changes class's mode
535  *
536  * This should be the only way to change a class's mode under normal
537  * circumstances. The routine will update the feed list linkage, change the mode
538  * and add the class to the wait event queue if appropriate. The new mode should
539  * be different from the old one and cl->pq_key has to be valid if changing
540  * to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
541  */
542 static void
543 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
544 {
545 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
546 
547 	if (new_mode == cl->cmode)
548 		return;
549 
550 	if (cl->prio_activity) {	/* not necessary: speed optimization */
551 		if (cl->cmode != HTB_CANT_SEND)
552 			htb_deactivate_prios(q, cl);
553 		cl->cmode = new_mode;
554 		if (new_mode != HTB_CANT_SEND)
555 			htb_activate_prios(q, cl);
556 	} else
557 		cl->cmode = new_mode;
558 }
559 
560 /**
561  * htb_activate - inserts leaf cl into appropriate active feeds
562  *
563  * Routine learns the (new) priority of the leaf and activates the feed chain
564  * for the prio. It can be called safely on an already active leaf.
565  * It also adds the leaf into the droplist.
566  */
567 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
568 {
569 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
570 
571 	if (!cl->prio_activity) {
572 		cl->prio_activity = 1 << cl->prio;
573 		htb_activate_prios(q, cl);
574 		list_add_tail(&cl->un.leaf.drop_list,
575 			      q->drops + cl->prio);
576 	}
577 }
578 
579 /**
580  * htb_deactivate - remove leaf cl from active feeds
581  *
582  * Make sure that the leaf is active. In other words it can't be called
583  * with a non-active leaf. It also removes the class from the drop list.
584  */
585 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
586 {
587 	WARN_ON(!cl->prio_activity);
588 
589 	htb_deactivate_prios(q, cl);
590 	cl->prio_activity = 0;
591 	list_del_init(&cl->un.leaf.drop_list);
592 }
593 
594 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
595 {
596 	int uninitialized_var(ret);
597 	struct htb_sched *q = qdisc_priv(sch);
598 	struct htb_class *cl = htb_classify(skb, sch, &ret);
599 
600 	if (cl == HTB_DIRECT) {
601 		/* enqueue to helper queue */
602 		if (q->direct_queue.qlen < q->direct_qlen) {
603 			__skb_queue_tail(&q->direct_queue, skb);
604 			q->direct_pkts++;
605 		} else {
606 			return qdisc_drop(skb, sch);
607 		}
608 #ifdef CONFIG_NET_CLS_ACT
609 	} else if (!cl) {
610 		if (ret & __NET_XMIT_BYPASS)
611 			sch->qstats.drops++;
612 		kfree_skb(skb);
613 		return ret;
614 #endif
615 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
616 		if (net_xmit_drop_count(ret)) {
617 			sch->qstats.drops++;
618 			cl->qstats.drops++;
619 		}
620 		return ret;
621 	} else {
622 		htb_activate(q, cl);
623 	}
624 
625 	sch->q.qlen++;
626 	return NET_XMIT_SUCCESS;
627 }
628 
629 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
630 {
631 	s64 toks = diff + cl->tokens;
632 
633 	if (toks > cl->buffer)
634 		toks = cl->buffer;
635 	toks -= (s64) l2t_ns(&cl->rate, bytes);
636 	if (toks <= -cl->mbuffer)
637 		toks = 1 - cl->mbuffer;
638 
639 	cl->tokens = toks;
640 }
641 
642 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
643 {
644 	s64 toks = diff + cl->ctokens;
645 
646 	if (toks > cl->cbuffer)
647 		toks = cl->cbuffer;
648 	toks -= (s64) l2t_ns(&cl->ceil, bytes);
649 	if (toks <= -cl->mbuffer)
650 		toks = 1 - cl->mbuffer;
651 
652 	cl->ctokens = toks;
653 }
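
/* Worked example (illustrative numbers): assume cl->rate corresponds to
 * 1 Mbit/s, so l2t_ns() charges 512000 ns for a 64 byte packet, and let
 * cl->buffer = 500000 and cl->tokens = 0.  After diff = 1000000 ns of
 * idle time htb_accnt_tokens() first clamps 0 + 1000000 to the 500000 ns
 * buffer and then subtracts 512000, leaving cl->tokens at -12000 ns,
 * i.e. the class slightly overdrew its rate and may have to borrow.
 */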
654 
655 /**
656  * htb_charge_class - charges amount "bytes" to leaf and ancestors
657  *
658  * Routine assumes that packet "bytes" long was dequeued from leaf cl
659  * borrowing from "level". It accounts bytes to ceil leaky bucket for
660  * leaf and all ancestors and to rate bucket for ancestors at levels
661  * "level" and higher. It also handles possible change of mode resulting
662  * from the update. Note that the mode can also increase here (MAY_BORROW to
663  * CAN_SEND) because we can use a more precise clock than the event queue here.
664  * In such a case we remove the class from the event queue first.
665  */
666 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
667 			     int level, struct sk_buff *skb)
668 {
669 	int bytes = qdisc_pkt_len(skb);
670 	enum htb_cmode old_mode;
671 	s64 diff;
672 
673 	while (cl) {
674 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
675 		if (cl->level >= level) {
676 			if (cl->level == level)
677 				cl->xstats.lends++;
678 			htb_accnt_tokens(cl, bytes, diff);
679 		} else {
680 			cl->xstats.borrows++;
681 			cl->tokens += diff;	/* we moved t_c; update tokens */
682 		}
683 		htb_accnt_ctokens(cl, bytes, diff);
684 		cl->t_c = q->now;
685 
686 		old_mode = cl->cmode;
687 		diff = 0;
688 		htb_change_class_mode(q, cl, &diff);
689 		if (old_mode != cl->cmode) {
690 			if (old_mode != HTB_CAN_SEND)
691 				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
692 			if (cl->cmode != HTB_CAN_SEND)
693 				htb_add_to_wait_tree(q, cl, diff);
694 		}
695 
696 		/* update basic stats except for leaves which are already updated */
697 		if (cl->level)
698 			bstats_update(&cl->bstats, skb);
699 
700 		cl = cl->parent;
701 	}
702 }
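
/* Example (illustrative): if a leaf borrowed from its grandparent, i.e.
 * level == 2, then the leaf (level 0) and its parent (level 1) only pay
 * ctokens and get their rate bucket refreshed (tokens += diff,
 * xstats.borrows++), while the grandparent (level 2, xstats.lends++) and
 * every ancestor above it pay both tokens and ctokens for the packet.
 */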
703 
704 /**
705  * htb_do_events - make mode changes to classes at the level
706  *
707  * Scans the event queue for pending events and applies them. Returns the time
708  * of the next pending event (0 for no event in pq, q->now for too many events).
709  * Note: Only events with cl->pq_key <= q->now are applied.
710  */
711 static psched_time_t htb_do_events(struct htb_sched *q, int level,
712 				   unsigned long start)
713 {
714 	/* don't run for longer than 2 jiffies; 2 is used instead of
715 	 * 1 to simplify things when jiffy is going to be incremented
716 	 * too soon
717 	 */
718 	unsigned long stop_at = start + 2;
719 	while (time_before(jiffies, stop_at)) {
720 		struct htb_class *cl;
721 		s64 diff;
722 		struct rb_node *p = rb_first(&q->wait_pq[level]);
723 
724 		if (!p)
725 			return 0;
726 
727 		cl = rb_entry(p, struct htb_class, pq_node);
728 		if (cl->pq_key > q->now)
729 			return cl->pq_key;
730 
731 		htb_safe_rb_erase(p, q->wait_pq + level);
732 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
733 		htb_change_class_mode(q, cl, &diff);
734 		if (cl->cmode != HTB_CAN_SEND)
735 			htb_add_to_wait_tree(q, cl, diff);
736 	}
737 
738 	/* too much load - let's continue after a break for scheduling */
739 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
740 		pr_warning("htb: too many events!\n");
741 		q->warned |= HTB_WARN_TOOMANYEVENTS;
742 	}
743 
744 	return q->now;
745 }
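
/* Example (illustrative): if three classes at this level have pq_key
 * values 10, 20 and 70 and q->now is 50, the first two are taken off the
 * wait queue and re-evaluated, while the third makes the function return
 * 70, which htb_dequeue() then caches in near_ev_cache and uses to
 * program the watchdog timer.
 */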
746 
747 /* Returns class->node+prio from the id-tree where the class's id is >= id.
748  * NULL if no such one exists.
749  */
750 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
751 					      u32 id)
752 {
753 	struct rb_node *r = NULL;
754 	while (n) {
755 		struct htb_class *cl =
756 		    rb_entry(n, struct htb_class, node[prio]);
757 
758 		if (id > cl->common.classid) {
759 			n = n->rb_right;
760 		} else if (id < cl->common.classid) {
761 			r = n;
762 			n = n->rb_left;
763 		} else {
764 			return n;
765 		}
766 	}
767 	return r;
768 }
769 
770 /**
771  * htb_lookup_leaf - returns next leaf class in DRR order
772  *
773  * Find the leaf the current feed pointer points to.
774  */
775 static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
776 					 struct rb_node **pptr, u32 * pid)
777 {
778 	int i;
779 	struct {
780 		struct rb_node *root;
781 		struct rb_node **pptr;
782 		u32 *pid;
783 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
784 
785 	BUG_ON(!tree->rb_node);
786 	sp->root = tree->rb_node;
787 	sp->pptr = pptr;
788 	sp->pid = pid;
789 
790 	for (i = 0; i < 65535; i++) {
791 		if (!*sp->pptr && *sp->pid) {
792 			/* ptr was invalidated but id is valid - try to recover
793 			 * the original or next ptr
794 			 */
795 			*sp->pptr =
796 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
797 		}
798 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
799 				 * can become out of date quickly
800 				 */
801 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
802 			*sp->pptr = sp->root;
803 			while ((*sp->pptr)->rb_left)
804 				*sp->pptr = (*sp->pptr)->rb_left;
805 			if (sp > stk) {
806 				sp--;
807 				if (!*sp->pptr) {
808 					WARN_ON(1);
809 					return NULL;
810 				}
811 				htb_next_rb_node(sp->pptr);
812 			}
813 		} else {
814 			struct htb_class *cl;
815 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
816 			if (!cl->level)
817 				return cl;
818 			(++sp)->root = cl->un.inner.feed[prio].rb_node;
819 			sp->pptr = cl->un.inner.ptr + prio;
820 			sp->pid = cl->un.inner.last_ptr_id + prio;
821 		}
822 	}
823 	WARN_ON(1);
824 	return NULL;
825 }
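
/* Example (illustrative): for prio 0 at level 1 the walk starts in
 * q->row[1][0].  Every entry there is a level 1 inner class, so it is
 * pushed on the stack and the walk continues in that class's
 * un.inner.feed[0] tree, resuming at un.inner.ptr[0] (or at the classid
 * saved in un.inner.last_ptr_id[0] if that pointer was invalidated),
 * until a level 0 class is found and returned.
 */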
826 
827 /* dequeues packet at given priority and level; call only if
828  * you are sure that there is an active class at prio/level
829  */
830 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
831 					int level)
832 {
833 	struct sk_buff *skb = NULL;
834 	struct htb_class *cl, *start;
835 	/* look initial class up in the row */
836 	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
837 				     q->ptr[level] + prio,
838 				     q->last_ptr_id[level] + prio);
839 
840 	do {
841 next:
842 		if (unlikely(!cl))
843 			return NULL;
844 
845 		/* class can be empty - it is unlikely but can be true if leaf
846 		 * qdisc drops packets in enqueue routine or if someone used
847 		 * graft operation on the leaf since last dequeue;
848 		 * simply deactivate and skip such class
849 		 */
850 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
851 			struct htb_class *next;
852 			htb_deactivate(q, cl);
853 
854 			/* row/level might become empty */
855 			if ((q->row_mask[level] & (1 << prio)) == 0)
856 				return NULL;
857 
858 			next = htb_lookup_leaf(q->row[level] + prio,
859 					       prio, q->ptr[level] + prio,
860 					       q->last_ptr_id[level] + prio);
861 
862 			if (cl == start)	/* fix start if we just deleted it */
863 				start = next;
864 			cl = next;
865 			goto next;
866 		}
867 
868 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
869 		if (likely(skb != NULL))
870 			break;
871 
872 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
873 		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
874 				  ptr[0]) + prio);
875 		cl = htb_lookup_leaf(q->row[level] + prio, prio,
876 				     q->ptr[level] + prio,
877 				     q->last_ptr_id[level] + prio);
878 
879 	} while (cl != start);
880 
881 	if (likely(skb != NULL)) {
882 		bstats_update(&cl->bstats, skb);
883 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
884 		if (cl->un.leaf.deficit[level] < 0) {
885 			cl->un.leaf.deficit[level] += cl->quantum;
886 			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
887 					  ptr[0]) + prio);
888 		}
889 		/* this used to be after charge_class but this constellation
890 		 * gives us slightly better performance
891 		 */
892 		if (!cl->un.leaf.q->q.qlen)
893 			htb_deactivate(q, cl);
894 		htb_charge_class(q, cl, level, skb);
895 	}
896 	return skb;
897 }
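
/* DRR example (illustrative numbers): with cl->quantum == 10000 and
 * cl->un.leaf.deficit[level] == 300, dequeuing a 1500 byte packet drives
 * the deficit to -1200; since that is negative the quantum is added back
 * (new deficit 8800) and the round robin pointer advances, so the next
 * dequeue at this prio/level starts from the following class.
 */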
898 
899 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
900 {
901 	struct sk_buff *skb;
902 	struct htb_sched *q = qdisc_priv(sch);
903 	int level;
904 	psched_time_t next_event;
905 	unsigned long start_at;
906 
907 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
908 	skb = __skb_dequeue(&q->direct_queue);
909 	if (skb != NULL) {
910 ok:
911 		qdisc_bstats_update(sch, skb);
912 		qdisc_unthrottled(sch);
913 		sch->q.qlen--;
914 		return skb;
915 	}
916 
917 	if (!sch->q.qlen)
918 		goto fin;
919 	q->now = ktime_to_ns(ktime_get());
920 	start_at = jiffies;
921 
922 	next_event = q->now + 5LLU * NSEC_PER_SEC;
923 
924 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
925 		/* common case optimization - skip event handler quickly */
926 		int m;
927 		psched_time_t event;
928 
929 		if (q->now >= q->near_ev_cache[level]) {
930 			event = htb_do_events(q, level, start_at);
931 			if (!event)
932 				event = q->now + NSEC_PER_SEC;
933 			q->near_ev_cache[level] = event;
934 		} else
935 			event = q->near_ev_cache[level];
936 
937 		if (next_event > event)
938 			next_event = event;
939 
940 		m = ~q->row_mask[level];
941 		while (m != (int)(-1)) {
942 			int prio = ffz(m);
943 
944 			m |= 1 << prio;
945 			skb = htb_dequeue_tree(q, prio, level);
946 			if (likely(skb != NULL))
947 				goto ok;
948 		}
949 	}
950 	sch->qstats.overlimits++;
951 	if (likely(next_event > q->now)) {
952 		if (!test_bit(__QDISC_STATE_DEACTIVATED,
953 			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
954 			ktime_t time = ns_to_ktime(next_event);
955 			qdisc_throttled(q->watchdog.qdisc);
956 			hrtimer_start(&q->watchdog.timer, time,
957 				      HRTIMER_MODE_ABS);
958 		}
959 	} else {
960 		schedule_work(&q->work);
961 	}
962 fin:
963 	return skb;
964 }
965 
966 /* try to drop from each class (by prio) until one succeeds */
967 static unsigned int htb_drop(struct Qdisc *sch)
968 {
969 	struct htb_sched *q = qdisc_priv(sch);
970 	int prio;
971 
972 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
973 		struct list_head *p;
974 		list_for_each(p, q->drops + prio) {
975 			struct htb_class *cl = list_entry(p, struct htb_class,
976 							  un.leaf.drop_list);
977 			unsigned int len;
978 			if (cl->un.leaf.q->ops->drop &&
979 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
980 				sch->q.qlen--;
981 				if (!cl->un.leaf.q->q.qlen)
982 					htb_deactivate(q, cl);
983 				return len;
984 			}
985 		}
986 	}
987 	return 0;
988 }
989 
990 /* reset all classes */
991 /* always called under BH & queue lock */
992 static void htb_reset(struct Qdisc *sch)
993 {
994 	struct htb_sched *q = qdisc_priv(sch);
995 	struct htb_class *cl;
996 	struct hlist_node *n;
997 	unsigned int i;
998 
999 	for (i = 0; i < q->clhash.hashsize; i++) {
1000 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1001 			if (cl->level)
1002 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
1003 			else {
1004 				if (cl->un.leaf.q)
1005 					qdisc_reset(cl->un.leaf.q);
1006 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1007 			}
1008 			cl->prio_activity = 0;
1009 			cl->cmode = HTB_CAN_SEND;
1010 
1011 		}
1012 	}
1013 	qdisc_watchdog_cancel(&q->watchdog);
1014 	__skb_queue_purge(&q->direct_queue);
1015 	sch->q.qlen = 0;
1016 	memset(q->row, 0, sizeof(q->row));
1017 	memset(q->row_mask, 0, sizeof(q->row_mask));
1018 	memset(q->wait_pq, 0, sizeof(q->wait_pq));
1019 	memset(q->ptr, 0, sizeof(q->ptr));
1020 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1021 		INIT_LIST_HEAD(q->drops + i);
1022 }
1023 
1024 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1025 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1026 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1027 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1028 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1029 };
1030 
1031 static void htb_work_func(struct work_struct *work)
1032 {
1033 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1034 	struct Qdisc *sch = q->watchdog.qdisc;
1035 
1036 	__netif_schedule(qdisc_root(sch));
1037 }
1038 
1039 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1040 {
1041 	struct htb_sched *q = qdisc_priv(sch);
1042 	struct nlattr *tb[TCA_HTB_INIT + 1];
1043 	struct tc_htb_glob *gopt;
1044 	int err;
1045 	int i;
1046 
1047 	if (!opt)
1048 		return -EINVAL;
1049 
1050 	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
1051 	if (err < 0)
1052 		return err;
1053 
1054 	if (tb[TCA_HTB_INIT] == NULL) {
1055 		pr_err("HTB: hey probably you have bad tc tool ?\n");
1056 		return -EINVAL;
1057 	}
1058 	gopt = nla_data(tb[TCA_HTB_INIT]);
1059 	if (gopt->version != HTB_VER >> 16) {
1060 		pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
1061 		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1062 		return -EINVAL;
1063 	}
1064 
1065 	err = qdisc_class_hash_init(&q->clhash);
1066 	if (err < 0)
1067 		return err;
1068 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1069 		INIT_LIST_HEAD(q->drops + i);
1070 
1071 	qdisc_watchdog_init(&q->watchdog, sch);
1072 	INIT_WORK(&q->work, htb_work_func);
1073 	skb_queue_head_init(&q->direct_queue);
1074 
1075 	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1076 	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1077 		q->direct_qlen = 2;
1078 
1079 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1080 		q->rate2quantum = 1;
1081 	q->defcls = gopt->defcls;
1082 
1083 	return 0;
1084 }
1085 
1086 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1087 {
1088 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1089 	struct htb_sched *q = qdisc_priv(sch);
1090 	struct nlattr *nest;
1091 	struct tc_htb_glob gopt;
1092 
1093 	spin_lock_bh(root_lock);
1094 
1095 	gopt.direct_pkts = q->direct_pkts;
1096 	gopt.version = HTB_VER;
1097 	gopt.rate2quantum = q->rate2quantum;
1098 	gopt.defcls = q->defcls;
1099 	gopt.debug = 0;
1100 
1101 	nest = nla_nest_start(skb, TCA_OPTIONS);
1102 	if (nest == NULL)
1103 		goto nla_put_failure;
1104 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt))
1105 		goto nla_put_failure;
1106 	nla_nest_end(skb, nest);
1107 
1108 	spin_unlock_bh(root_lock);
1109 	return skb->len;
1110 
1111 nla_put_failure:
1112 	spin_unlock_bh(root_lock);
1113 	nla_nest_cancel(skb, nest);
1114 	return -1;
1115 }
1116 
1117 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1118 			  struct sk_buff *skb, struct tcmsg *tcm)
1119 {
1120 	struct htb_class *cl = (struct htb_class *)arg;
1121 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1122 	struct nlattr *nest;
1123 	struct tc_htb_opt opt;
1124 
1125 	spin_lock_bh(root_lock);
1126 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1127 	tcm->tcm_handle = cl->common.classid;
1128 	if (!cl->level && cl->un.leaf.q)
1129 		tcm->tcm_info = cl->un.leaf.q->handle;
1130 
1131 	nest = nla_nest_start(skb, TCA_OPTIONS);
1132 	if (nest == NULL)
1133 		goto nla_put_failure;
1134 
1135 	memset(&opt, 0, sizeof(opt));
1136 
1137 	opt.rate.rate = cl->rate.rate_bps >> 3;
1138 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1139 	opt.ceil.rate = cl->ceil.rate_bps >> 3;
1140 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1141 	opt.quantum = cl->quantum;
1142 	opt.prio = cl->prio;
1143 	opt.level = cl->level;
1144 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1145 		goto nla_put_failure;
1146 
1147 	nla_nest_end(skb, nest);
1148 	spin_unlock_bh(root_lock);
1149 	return skb->len;
1150 
1151 nla_put_failure:
1152 	spin_unlock_bh(root_lock);
1153 	nla_nest_cancel(skb, nest);
1154 	return -1;
1155 }
1156 
1157 static int
1158 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1159 {
1160 	struct htb_class *cl = (struct htb_class *)arg;
1161 
1162 	if (!cl->level && cl->un.leaf.q)
1163 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1164 	cl->xstats.tokens = cl->tokens;
1165 	cl->xstats.ctokens = cl->ctokens;
1166 
1167 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1168 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1169 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1170 		return -1;
1171 
1172 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1173 }
1174 
1175 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1176 		     struct Qdisc **old)
1177 {
1178 	struct htb_class *cl = (struct htb_class *)arg;
1179 
1180 	if (cl->level)
1181 		return -EINVAL;
1182 	if (new == NULL &&
1183 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1184 				     cl->common.classid)) == NULL)
1185 		return -ENOBUFS;
1186 
1187 	sch_tree_lock(sch);
1188 	*old = cl->un.leaf.q;
1189 	cl->un.leaf.q = new;
1190 	if (*old != NULL) {
1191 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1192 		qdisc_reset(*old);
1193 	}
1194 	sch_tree_unlock(sch);
1195 	return 0;
1196 }
1197 
1198 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1199 {
1200 	struct htb_class *cl = (struct htb_class *)arg;
1201 	return !cl->level ? cl->un.leaf.q : NULL;
1202 }
1203 
1204 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1205 {
1206 	struct htb_class *cl = (struct htb_class *)arg;
1207 
1208 	if (cl->un.leaf.q->q.qlen == 0)
1209 		htb_deactivate(qdisc_priv(sch), cl);
1210 }
1211 
1212 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1213 {
1214 	struct htb_class *cl = htb_find(classid, sch);
1215 	if (cl)
1216 		cl->refcnt++;
1217 	return (unsigned long)cl;
1218 }
1219 
1220 static inline int htb_parent_last_child(struct htb_class *cl)
1221 {
1222 	if (!cl->parent)
1223 		/* the root class */
1224 		return 0;
1225 	if (cl->parent->children > 1)
1226 		/* not the last child */
1227 		return 0;
1228 	return 1;
1229 }
1230 
1231 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1232 			       struct Qdisc *new_q)
1233 {
1234 	struct htb_class *parent = cl->parent;
1235 
1236 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1237 
1238 	if (parent->cmode != HTB_CAN_SEND)
1239 		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1240 
1241 	parent->level = 0;
1242 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1243 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1244 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1245 	parent->tokens = parent->buffer;
1246 	parent->ctokens = parent->cbuffer;
1247 	parent->t_c = psched_get_time();
1248 	parent->cmode = HTB_CAN_SEND;
1249 }
1250 
1251 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1252 {
1253 	if (!cl->level) {
1254 		WARN_ON(!cl->un.leaf.q);
1255 		qdisc_destroy(cl->un.leaf.q);
1256 	}
1257 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1258 	tcf_destroy_chain(&cl->filter_list);
1259 	kfree(cl);
1260 }
1261 
1262 static void htb_destroy(struct Qdisc *sch)
1263 {
1264 	struct htb_sched *q = qdisc_priv(sch);
1265 	struct hlist_node *n, *next;
1266 	struct htb_class *cl;
1267 	unsigned int i;
1268 
1269 	cancel_work_sync(&q->work);
1270 	qdisc_watchdog_cancel(&q->watchdog);
1271 	/* This line used to be after htb_destroy_class call below
1272 	 * and surprisingly it worked in 2.4. But it must precede it
1273 	 * because filters need their target class alive to be able to call
1274 	 * unbind_filter on it (without Oops).
1275 	 */
1276 	tcf_destroy_chain(&q->filter_list);
1277 
1278 	for (i = 0; i < q->clhash.hashsize; i++) {
1279 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1280 			tcf_destroy_chain(&cl->filter_list);
1281 	}
1282 	for (i = 0; i < q->clhash.hashsize; i++) {
1283 		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1284 					  common.hnode)
1285 			htb_destroy_class(sch, cl);
1286 	}
1287 	qdisc_class_hash_destroy(&q->clhash);
1288 	__skb_queue_purge(&q->direct_queue);
1289 }
1290 
1291 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1292 {
1293 	struct htb_sched *q = qdisc_priv(sch);
1294 	struct htb_class *cl = (struct htb_class *)arg;
1295 	unsigned int qlen;
1296 	struct Qdisc *new_q = NULL;
1297 	int last_child = 0;
1298 
1299 	// TODO: why don't we allow deleting a subtree? references? does
1300 	// the tc subsystem guarantee us that in htb_destroy it holds no class
1301 	// refs so that we can remove children safely there?
1302 	if (cl->children || cl->filter_cnt)
1303 		return -EBUSY;
1304 
1305 	if (!cl->level && htb_parent_last_child(cl)) {
1306 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1307 					  cl->parent->common.classid);
1308 		last_child = 1;
1309 	}
1310 
1311 	sch_tree_lock(sch);
1312 
1313 	if (!cl->level) {
1314 		qlen = cl->un.leaf.q->q.qlen;
1315 		qdisc_reset(cl->un.leaf.q);
1316 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1317 	}
1318 
1319 	/* delete from hash and active; remainder in destroy_class */
1320 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1321 	if (cl->parent)
1322 		cl->parent->children--;
1323 
1324 	if (cl->prio_activity)
1325 		htb_deactivate(q, cl);
1326 
1327 	if (cl->cmode != HTB_CAN_SEND)
1328 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1329 
1330 	if (last_child)
1331 		htb_parent_to_leaf(q, cl, new_q);
1332 
1333 	BUG_ON(--cl->refcnt == 0);
1334 	/*
1335 	 * This shouldn't happen: we "hold" one cops->get() when called
1336 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1337 	 */
1338 
1339 	sch_tree_unlock(sch);
1340 	return 0;
1341 }
1342 
1343 static void htb_put(struct Qdisc *sch, unsigned long arg)
1344 {
1345 	struct htb_class *cl = (struct htb_class *)arg;
1346 
1347 	if (--cl->refcnt == 0)
1348 		htb_destroy_class(sch, cl);
1349 }
1350 
1351 static int htb_change_class(struct Qdisc *sch, u32 classid,
1352 			    u32 parentid, struct nlattr **tca,
1353 			    unsigned long *arg)
1354 {
1355 	int err = -EINVAL;
1356 	struct htb_sched *q = qdisc_priv(sch);
1357 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1358 	struct nlattr *opt = tca[TCA_OPTIONS];
1359 	struct nlattr *tb[__TCA_HTB_MAX];
1360 	struct tc_htb_opt *hopt;
1361 
1362 	/* extract all subattrs from opt attr */
1363 	if (!opt)
1364 		goto failure;
1365 
1366 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1367 	if (err < 0)
1368 		goto failure;
1369 
1370 	err = -EINVAL;
1371 	if (tb[TCA_HTB_PARMS] == NULL)
1372 		goto failure;
1373 
1374 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1375 
1376 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1377 	if (!hopt->rate.rate || !hopt->ceil.rate)
1378 		goto failure;
1379 
1380 	if (!cl) {		/* new class */
1381 		struct Qdisc *new_q;
1382 		int prio;
1383 		struct {
1384 			struct nlattr		nla;
1385 			struct gnet_estimator	opt;
1386 		} est = {
1387 			.nla = {
1388 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1389 				.nla_type	= TCA_RATE,
1390 			},
1391 			.opt = {
1392 				/* 4s interval, 16s averaging constant */
1393 				.interval	= 2,
1394 				.ewma_log	= 2,
1395 			},
1396 		};
1397 
1398 		/* check for valid classid */
1399 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1400 		    htb_find(classid, sch))
1401 			goto failure;
1402 
1403 		/* check maximal depth */
1404 		if (parent && parent->parent && parent->parent->level < 2) {
1405 			pr_err("htb: tree is too deep\n");
1406 			goto failure;
1407 		}
1408 		err = -ENOBUFS;
1409 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1410 		if (!cl)
1411 			goto failure;
1412 
1413 		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1414 					qdisc_root_sleeping_lock(sch),
1415 					tca[TCA_RATE] ? : &est.nla);
1416 		if (err) {
1417 			kfree(cl);
1418 			goto failure;
1419 		}
1420 
1421 		cl->refcnt = 1;
1422 		cl->children = 0;
1423 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1424 		RB_CLEAR_NODE(&cl->pq_node);
1425 
1426 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1427 			RB_CLEAR_NODE(&cl->node[prio]);
1428 
1429 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1430 		 * so it can't be used inside of sch_tree_lock
1431 		 * -- thanks to Karlis Peisenieks
1432 		 */
1433 		new_q = qdisc_create_dflt(sch->dev_queue,
1434 					  &pfifo_qdisc_ops, classid);
1435 		sch_tree_lock(sch);
1436 		if (parent && !parent->level) {
1437 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1438 
1439 			/* turn parent into inner node */
1440 			qdisc_reset(parent->un.leaf.q);
1441 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1442 			qdisc_destroy(parent->un.leaf.q);
1443 			if (parent->prio_activity)
1444 				htb_deactivate(q, parent);
1445 
1446 			/* remove from evt list because of level change */
1447 			if (parent->cmode != HTB_CAN_SEND) {
1448 				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1449 				parent->cmode = HTB_CAN_SEND;
1450 			}
1451 			parent->level = (parent->parent ? parent->parent->level
1452 					 : TC_HTB_MAXDEPTH) - 1;
1453 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1454 		}
1455 		/* leaf (we) needs elementary qdisc */
1456 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1457 
1458 		cl->common.classid = classid;
1459 		cl->parent = parent;
1460 
1461 		/* set class to be in HTB_CAN_SEND state */
1462 		cl->tokens = hopt->buffer;
1463 		cl->ctokens = hopt->cbuffer;
1464 		cl->mbuffer = 60ull * NSEC_PER_SEC;	/* 1min; tokens are kept in ns here */
1465 		cl->t_c = psched_get_time();
1466 		cl->cmode = HTB_CAN_SEND;
1467 
1468 		/* attach to the hash list and parent's family */
1469 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1470 		if (parent)
1471 			parent->children++;
1472 	} else {
1473 		if (tca[TCA_RATE]) {
1474 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1475 						    qdisc_root_sleeping_lock(sch),
1476 						    tca[TCA_RATE]);
1477 			if (err)
1478 				return err;
1479 		}
1480 		sch_tree_lock(sch);
1481 	}
1482 
1483 	/* there used to be a nasty bug here: we have to check that the node
1484 	 * is really a leaf before changing cl->un.leaf !
1485 	 */
1486 	if (!cl->level) {
1487 		cl->quantum = hopt->rate.rate / q->rate2quantum;
1488 		if (!hopt->quantum && cl->quantum < 1000) {
1489 			pr_warning(
1490 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1491 			       cl->common.classid);
1492 			cl->quantum = 1000;
1493 		}
1494 		if (!hopt->quantum && cl->quantum > 200000) {
1495 			pr_warning(
1496 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1497 			       cl->common.classid);
1498 			cl->quantum = 200000;
1499 		}
1500 		if (hopt->quantum)
1501 			cl->quantum = hopt->quantum;
1502 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1503 			cl->prio = TC_HTB_NUMPRIO - 1;
1504 	}
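
	/* Example of the r2q heuristic above (illustrative numbers): with
	 * rate2quantum = 10 (tc's usual default) and hopt->rate.rate of
	 * 1250000 B/s (10 Mbit/s) the quantum becomes 125000 bytes; for a
	 * 10 kbit/s class (1250 B/s) the computed quantum of 125 would be
	 * clamped to 1000 and the "is small" warning printed, unless an
	 * explicit quantum was supplied in hopt->quantum.
	 */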
1505 
1506 	cl->buffer = hopt->buffer;
1507 	cl->cbuffer = hopt->cbuffer;
1508 
1509 	cl->rate.rate_bps = (u64)hopt->rate.rate << 3;
1510 	cl->ceil.rate_bps = (u64)hopt->ceil.rate << 3;
1511 
1512 	htb_precompute_ratedata(&cl->rate);
1513 	htb_precompute_ratedata(&cl->ceil);
1514 
1515 	cl->buffer = hopt->buffer << PSCHED_SHIFT;
1516 	cl->cbuffer = hopt->cbuffer << PSCHED_SHIFT;
1517 
1518 	sch_tree_unlock(sch);
1519 
1520 	qdisc_class_hash_grow(sch, &q->clhash);
1521 
1522 	*arg = (unsigned long)cl;
1523 	return 0;
1524 
1525 failure:
1526 	return err;
1527 }
1528 
1529 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1530 {
1531 	struct htb_sched *q = qdisc_priv(sch);
1532 	struct htb_class *cl = (struct htb_class *)arg;
1533 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1534 
1535 	return fl;
1536 }
1537 
1538 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1539 				     u32 classid)
1540 {
1541 	struct htb_class *cl = htb_find(classid, sch);
1542 
1543 	/*if (cl && !cl->level) return 0;
1544 	 * The line above used to be there to prevent attaching filters to
1545 	 * leaves. But at least tc_index filter uses this just to get class
1546 	 * for other reasons, so we have to allow for it.
1547 	 * ----
1548 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1549 	 * another way to "lock" the class - unlike "get" this lock can
1550 	 * be broken by class during destroy IIUC.
1551 	 */
1552 	if (cl)
1553 		cl->filter_cnt++;
1554 	return (unsigned long)cl;
1555 }
1556 
1557 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1558 {
1559 	struct htb_class *cl = (struct htb_class *)arg;
1560 
1561 	if (cl)
1562 		cl->filter_cnt--;
1563 }
1564 
1565 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1566 {
1567 	struct htb_sched *q = qdisc_priv(sch);
1568 	struct htb_class *cl;
1569 	struct hlist_node *n;
1570 	unsigned int i;
1571 
1572 	if (arg->stop)
1573 		return;
1574 
1575 	for (i = 0; i < q->clhash.hashsize; i++) {
1576 		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1577 			if (arg->count < arg->skip) {
1578 				arg->count++;
1579 				continue;
1580 			}
1581 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1582 				arg->stop = 1;
1583 				return;
1584 			}
1585 			arg->count++;
1586 		}
1587 	}
1588 }
1589 
1590 static const struct Qdisc_class_ops htb_class_ops = {
1591 	.graft		=	htb_graft,
1592 	.leaf		=	htb_leaf,
1593 	.qlen_notify	=	htb_qlen_notify,
1594 	.get		=	htb_get,
1595 	.put		=	htb_put,
1596 	.change		=	htb_change_class,
1597 	.delete		=	htb_delete,
1598 	.walk		=	htb_walk,
1599 	.tcf_chain	=	htb_find_tcf,
1600 	.bind_tcf	=	htb_bind_filter,
1601 	.unbind_tcf	=	htb_unbind_filter,
1602 	.dump		=	htb_dump_class,
1603 	.dump_stats	=	htb_dump_class_stats,
1604 };
1605 
1606 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1607 	.cl_ops		=	&htb_class_ops,
1608 	.id		=	"htb",
1609 	.priv_size	=	sizeof(struct htb_sched),
1610 	.enqueue	=	htb_enqueue,
1611 	.dequeue	=	htb_dequeue,
1612 	.peek		=	qdisc_peek_dequeued,
1613 	.drop		=	htb_drop,
1614 	.init		=	htb_init,
1615 	.reset		=	htb_reset,
1616 	.destroy	=	htb_destroy,
1617 	.dump		=	htb_dump,
1618 	.owner		=	THIS_MODULE,
1619 };
1620 
1621 static int __init htb_module_init(void)
1622 {
1623 	return register_qdisc(&htb_qdisc_ops);
1624 }
1625 static void __exit htb_module_exit(void)
1626 {
1627 	unregister_qdisc(&htb_qdisc_ops);
1628 }
1629 
1630 module_init(htb_module_init)
1631 module_exit(htb_module_exit)
1632 MODULE_LICENSE("GPL");
1633