xref: /openbmc/linux/net/sched/sch_htb.c (revision c4ee0af3)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 
44 /* HTB algorithm.
45     Author: devik@cdi.cz
46     ========================================================================
47     HTB is like TBF with multiple classes. It is also similar to CBQ because
48     it allows assigning a priority to each class in the hierarchy.
49     In fact it is another implementation of Floyd's formal sharing.
50 
51     Levels:
52     Each class is assigned a level. Leaves ALWAYS have level 0 and root
53     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
54     one less than their parent.
55 */
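/* Example (illustrative, assuming TC_HTB_MAXDEPTH == 8 as in pkt_sched.h):
 * in a hierarchy root 1:1 -> inner 1:10 -> leaf 1:100 the root gets level 7,
 * the inner class level 6 and the leaf level 0; levels need not be
 * contiguous because leaves are pinned to level 0.
 */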
56 
57 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
58 #define HTB_VER 0x30011		/* major must match the version number supplied by TC */
59 
60 #if HTB_VER >> 16 != TC_HTB_PROTOVER
61 #error "Mismatched sch_htb.c and pkt_sch.h"
62 #endif
63 
64 /* Module parameter and sysfs export */
65 module_param    (htb_hysteresis, int, 0640);
66 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
67 
68 static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
69 module_param(htb_rate_est, int, 0640);
70 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
71 
72 /* used internally to keep the status of a single class */
73 enum htb_cmode {
74 	HTB_CANT_SEND,		/* class can't send and can't borrow */
75 	HTB_MAY_BORROW,		/* class can't send but may borrow */
76 	HTB_CAN_SEND		/* class can send */
77 };
78 
79 struct htb_prio {
80 	union {
81 		struct rb_root	row;
82 		struct rb_root	feed;
83 	};
84 	struct rb_node	*ptr;
85 	/* When a class changes from state 1->2 and disconnects from
86 	 * its parent's feed, we lose the ptr value and start from the
87 	 * first child again. Here we store the classid of the
88 	 * last valid ptr (used when ptr is NULL).
89 	 */
90 	u32		last_ptr_id;
91 };
92 
93 /* interior & leaf nodes; props specific to leaves are marked L:
94  * To reduce false sharing, place mostly read fields at beginning,
95  * and mostly written ones at the end.
96  */
97 struct htb_class {
98 	struct Qdisc_class_common common;
99 	struct psched_ratecfg	rate;
100 	struct psched_ratecfg	ceil;
101 	s64			buffer, cbuffer;/* token bucket depth/rate */
102 	s64			mbuffer;	/* max wait time */
103 	u32			prio;		/* these two are used only by leaves... */
104 	int			quantum;	/* but stored for parent-to-leaf return */
105 
106 	struct tcf_proto	*filter_list;	/* class attached filters */
107 	int			filter_cnt;
108 	int			refcnt;		/* usage count of this class */
109 
110 	int			level;		/* our level (see above) */
111 	unsigned int		children;
112 	struct htb_class	*parent;	/* parent class */
113 
114 	struct gnet_stats_rate_est64 rate_est;
115 
116 	/*
117 	 * Frequently written fields
118 	 */
119 	struct gnet_stats_basic_packed bstats;
120 	struct gnet_stats_queue	qstats;
121 	struct tc_htb_xstats	xstats;	/* our special stats */
122 
123 	/* token bucket parameters */
124 	s64			tokens, ctokens;/* current number of tokens */
125 	s64			t_c;		/* checkpoint time */
126 
127 	union {
128 		struct htb_class_leaf {
129 			struct list_head drop_list;
130 			int		deficit[TC_HTB_MAXDEPTH];
131 			struct Qdisc	*q;
132 		} leaf;
133 		struct htb_class_inner {
134 			struct htb_prio clprio[TC_HTB_NUMPRIO];
135 		} inner;
136 	} un;
137 	s64			pq_key;
138 
139 	int			prio_activity;	/* for which prios are we active */
140 	enum htb_cmode		cmode;		/* current mode of the class */
141 	struct rb_node		pq_node;	/* node for event queue */
142 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto	*filter_list;
157 
158 #define HTB_WARN_TOOMANYEVENTS	0x1
159 	unsigned int		warned;	/* only one warning */
160 	int			direct_qlen;
161 	struct work_struct	work;
162 
163 	/* non-shaped skbs; let them go directly thru */
164 	struct sk_buff_head	direct_queue;
165 	long			direct_pkts;
166 
167 	struct qdisc_watchdog	watchdog;
168 
169 	s64			now;	/* cached dequeue time */
170 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
171 
172 	/* time of nearest event per level (row) */
173 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
174 
175 	int			row_mask[TC_HTB_MAXDEPTH];
176 
177 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
178 };
179 
180 /* find class in global hash table using given handle */
181 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
182 {
183 	struct htb_sched *q = qdisc_priv(sch);
184 	struct Qdisc_class_common *clc;
185 
186 	clc = qdisc_class_find(&q->clhash, handle);
187 	if (clc == NULL)
188 		return NULL;
189 	return container_of(clc, struct htb_class, common);
190 }
191 
192 /**
193  * htb_classify - classify a packet into class
194  *
195  * It returns NULL if the packet should be dropped or -1 if the packet
196  * should be passed directly thru. In all other cases a leaf class is returned.
197  * We allow direct class selection by classid in skb->priority. Then we examine
198  * filters in the qdisc and in inner nodes (if a higher filter points to an inner
199  * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
200  * internal fifo (direct). These packets then go directly thru. If we still
201  * have no valid leaf we try to use the MAJOR:default leaf. If that is still
202  * unsuccessful we finish and return the direct queue.
203  */
204 #define HTB_DIRECT ((struct htb_class *)-1L)
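/* HTB_DIRECT is a sentinel distinct from both NULL (drop) and any real
 * class pointer; see htb_classify() below and htb_enqueue().
 */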
205 
206 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
207 				      int *qerr)
208 {
209 	struct htb_sched *q = qdisc_priv(sch);
210 	struct htb_class *cl;
211 	struct tcf_result res;
212 	struct tcf_proto *tcf;
213 	int result;
214 
215 	/* allow selecting a class by setting skb->priority to a valid classid;
216 	 * note that nfmark can be used too by attaching the fw filter with no
217 	 * rules in it
218 	 */
219 	if (skb->priority == sch->handle)
220 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
221 	cl = htb_find(skb->priority, sch);
222 	if (cl && cl->level == 0)
223 		return cl;
224 
225 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
226 	tcf = q->filter_list;
227 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
228 #ifdef CONFIG_NET_CLS_ACT
229 		switch (result) {
230 		case TC_ACT_QUEUED:
231 		case TC_ACT_STOLEN:
232 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
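			/* fall through: QUEUED/STOLEN also return NULL, but with
			 * *qerr marked as stolen so the caller does not count a
			 * drop (see htb_enqueue())
			 */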
233 		case TC_ACT_SHOT:
234 			return NULL;
235 		}
236 #endif
237 		cl = (void *)res.class;
238 		if (!cl) {
239 			if (res.classid == sch->handle)
240 				return HTB_DIRECT;	/* X:0 (direct flow) */
241 			cl = htb_find(res.classid, sch);
242 			if (!cl)
243 				break;	/* filter selected invalid classid */
244 		}
245 		if (!cl->level)
246 			return cl;	/* we hit leaf; return it */
247 
248 		/* we have got inner class; apply inner filter chain */
249 		tcf = cl->filter_list;
250 	}
251 	/* classification failed; try to use default class */
252 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
253 	if (!cl || cl->level)
254 		return HTB_DIRECT;	/* bad default .. this is safe bet */
255 	return cl;
256 }
257 
258 /**
259  * htb_add_to_id_tree - adds class to the round robin list
260  *
261  * The routine adds the class to the list (actually a tree) sorted by classid.
262  * Make sure that the class is not already on such a list for the given prio.
263  */
264 static void htb_add_to_id_tree(struct rb_root *root,
265 			       struct htb_class *cl, int prio)
266 {
267 	struct rb_node **p = &root->rb_node, *parent = NULL;
268 
269 	while (*p) {
270 		struct htb_class *c;
271 		parent = *p;
272 		c = rb_entry(parent, struct htb_class, node[prio]);
273 
274 		if (cl->common.classid > c->common.classid)
275 			p = &parent->rb_right;
276 		else
277 			p = &parent->rb_left;
278 	}
279 	rb_link_node(&cl->node[prio], parent, p);
280 	rb_insert_color(&cl->node[prio], root);
281 }
282 
283 /**
284  * htb_add_to_wait_tree - adds class to the event queue with delay
285  *
286  * The class is added to the priority event queue to indicate that the class
287  * will change its mode at time cl->pq_key (in nanoseconds). Make sure that
288  * the class is not already in the queue.
289  */
290 static void htb_add_to_wait_tree(struct htb_sched *q,
291 				 struct htb_class *cl, s64 delay)
292 {
293 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
294 
295 	cl->pq_key = q->now + delay;
296 	if (cl->pq_key == q->now)
297 		cl->pq_key++;
298 
299 	/* update the nearest event cache */
300 	if (q->near_ev_cache[cl->level] > cl->pq_key)
301 		q->near_ev_cache[cl->level] = cl->pq_key;
302 
303 	while (*p) {
304 		struct htb_class *c;
305 		parent = *p;
306 		c = rb_entry(parent, struct htb_class, pq_node);
307 		if (cl->pq_key >= c->pq_key)
308 			p = &parent->rb_right;
309 		else
310 			p = &parent->rb_left;
311 	}
312 	rb_link_node(&cl->pq_node, parent, p);
313 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
314 }
315 
316 /**
317  * htb_next_rb_node - finds next node in binary tree
318  *
319  * When we are past the last key, *n is set to NULL.
320  * Average complexity is 2 steps per call.
321  */
322 static inline void htb_next_rb_node(struct rb_node **n)
323 {
324 	*n = rb_next(*n);
325 }
326 
327 /**
328  * htb_add_class_to_row - add class to its row
329  *
330  * The class is added to row at priorities marked in mask.
331  * It does nothing if mask == 0.
332  */
333 static inline void htb_add_class_to_row(struct htb_sched *q,
334 					struct htb_class *cl, int mask)
335 {
336 	q->row_mask[cl->level] |= mask;
337 	while (mask) {
338 		int prio = ffz(~mask);
339 		mask &= ~(1 << prio);
340 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
341 	}
342 }
343 
344 /* If this triggers, it is a bug in this code, but it need not be fatal */
345 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
346 {
347 	if (RB_EMPTY_NODE(rb)) {
348 		WARN_ON(1);
349 	} else {
350 		rb_erase(rb, root);
351 		RB_CLEAR_NODE(rb);
352 	}
353 }
354 
355 
356 /**
357  * htb_remove_class_from_row - removes class from its row
358  *
359  * The class is removed from row at priorities marked in mask.
360  * It does nothing if mask == 0.
361  */
362 static inline void htb_remove_class_from_row(struct htb_sched *q,
363 						 struct htb_class *cl, int mask)
364 {
365 	int m = 0;
366 	struct htb_level *hlevel = &q->hlevel[cl->level];
367 
368 	while (mask) {
369 		int prio = ffz(~mask);
370 		struct htb_prio *hprio = &hlevel->hprio[prio];
371 
372 		mask &= ~(1 << prio);
373 		if (hprio->ptr == cl->node + prio)
374 			htb_next_rb_node(&hprio->ptr);
375 
376 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
377 		if (!hprio->row.rb_node)
378 			m |= 1 << prio;
379 	}
380 	q->row_mask[cl->level] &= ~m;
381 }
382 
383 /**
384  * htb_activate_prios - creates the active class's feed chain
385  *
386  * The class is connected to its ancestors and/or the appropriate rows
387  * for the priorities it is participating in. cl->cmode must be the new
388  * (activated) mode. It does nothing if cl->prio_activity == 0.
389  */
390 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
391 {
392 	struct htb_class *p = cl->parent;
393 	long m, mask = cl->prio_activity;
394 
395 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
396 		m = mask;
397 		while (m) {
398 			int prio = ffz(~m);
399 			m &= ~(1 << prio);
400 
401 			if (p->un.inner.clprio[prio].feed.rb_node)
402 				/* parent already has its feed in use, so reset
403 				 * the bit in mask as the parent is already ok
404 				 */
405 				mask &= ~(1 << prio);
406 
407 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
408 		}
409 		p->prio_activity |= mask;
410 		cl = p;
411 		p = cl->parent;
412 
413 	}
414 	if (cl->cmode == HTB_CAN_SEND && mask)
415 		htb_add_class_to_row(q, cl, mask);
416 }
417 
418 /**
419  * htb_deactivate_prios - removes the class from the feed chains
420  *
421  * cl->cmode must represent the old mode (before deactivation). It does
422  * nothing if cl->prio_activity == 0. The class is removed from all feed
423  * chains and rows.
424  */
425 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
426 {
427 	struct htb_class *p = cl->parent;
428 	long m, mask = cl->prio_activity;
429 
430 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
431 		m = mask;
432 		mask = 0;
433 		while (m) {
434 			int prio = ffz(~m);
435 			m &= ~(1 << prio);
436 
437 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
438 				/* we are removing a child which is pointed to from
439 				 * the parent feed - forget the pointer but remember
440 				 * the classid
441 				 */
442 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
443 				p->un.inner.clprio[prio].ptr = NULL;
444 			}
445 
446 			htb_safe_rb_erase(cl->node + prio,
447 					  &p->un.inner.clprio[prio].feed);
448 
449 			if (!p->un.inner.clprio[prio].feed.rb_node)
450 				mask |= 1 << prio;
451 		}
452 
453 		p->prio_activity &= ~mask;
454 		cl = p;
455 		p = cl->parent;
456 
457 	}
458 	if (cl->cmode == HTB_CAN_SEND && mask)
459 		htb_remove_class_from_row(q, cl, mask);
460 }
461 
462 static inline s64 htb_lowater(const struct htb_class *cl)
463 {
464 	if (htb_hysteresis)
465 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
466 	else
467 		return 0;
468 }
469 static inline s64 htb_hiwater(const struct htb_class *cl)
470 {
471 	if (htb_hysteresis)
472 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
473 	else
474 		return 0;
475 }
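/* With hysteresis enabled, the watermarks used by htb_class_mode() depend
 * on the current mode: roughly, a class that is already allowed to send
 * keeps that right until its (c)tokens fall below -(c)buffer instead of
 * flapping around 0, trading a bit of accuracy for fewer mode transitions.
 */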
476 
477 
478 /**
479  * htb_class_mode - computes and returns current class mode
480  *
481  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
482  * is not HTB_CAN_SEND then *diff is set to the time difference from now
483  * to the time when cl will change its state.
484  * It is also worth noting that the class mode doesn't change simply
485  * at cl->{c,}tokens == 0; rather, there can be hysteresis over the
486  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
487  * mode transitions per time unit. The speed gain is about 1/6.
488  */
489 static inline enum htb_cmode
490 htb_class_mode(struct htb_class *cl, s64 *diff)
491 {
492 	s64 toks;
493 
494 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
495 		*diff = -toks;
496 		return HTB_CANT_SEND;
497 	}
498 
499 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
500 		return HTB_CAN_SEND;
501 
502 	*diff = -toks;
503 	return HTB_MAY_BORROW;
504 }
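/* Worked example (illustrative, with hysteresis off so both watermarks
 * are 0): if cl->ctokens + *diff == +1000 and cl->tokens + *diff == -3000,
 * the class cannot send at its own rate but may still borrow, so
 * HTB_MAY_BORROW is returned and *diff becomes 3000, i.e. roughly the
 * time in ns until the rate bucket refills back to zero.
 */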
505 
506 /**
507  * htb_change_class_mode - changes the class's mode
508  *
509  * This should be the only way to change a class's mode under normal
510  * circumstances. The routine will update the feed list linkage, change the
511  * mode and add the class to the wait event queue if appropriate. The new mode
512  * should be different from the old one and cl->pq_key has to be valid if
513  * changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
514  */
515 static void
516 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
517 {
518 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
519 
520 	if (new_mode == cl->cmode)
521 		return;
522 
523 	if (cl->prio_activity) {	/* not necessary: speed optimization */
524 		if (cl->cmode != HTB_CANT_SEND)
525 			htb_deactivate_prios(q, cl);
526 		cl->cmode = new_mode;
527 		if (new_mode != HTB_CANT_SEND)
528 			htb_activate_prios(q, cl);
529 	} else
530 		cl->cmode = new_mode;
531 }
532 
533 /**
534  * htb_activate - inserts leaf cl into appropriate active feeds
535  *
536  * The routine learns the (new) priority of the leaf and activates the feed
537  * chain for that prio. It can safely be called on an already active leaf.
538  * It also adds the leaf to the drop list.
539  */
540 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
541 {
542 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
543 
544 	if (!cl->prio_activity) {
545 		cl->prio_activity = 1 << cl->prio;
546 		htb_activate_prios(q, cl);
547 		list_add_tail(&cl->un.leaf.drop_list,
548 			      q->drops + cl->prio);
549 	}
550 }
551 
552 /**
553  * htb_deactivate - remove leaf cl from active feeds
554  *
555  * Make sure that the leaf is active. In other words, it can't be called
556  * with a non-active leaf. It also removes the class from the drop list.
557  */
558 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
559 {
560 	WARN_ON(!cl->prio_activity);
561 
562 	htb_deactivate_prios(q, cl);
563 	cl->prio_activity = 0;
564 	list_del_init(&cl->un.leaf.drop_list);
565 }
566 
567 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
568 {
569 	int uninitialized_var(ret);
570 	struct htb_sched *q = qdisc_priv(sch);
571 	struct htb_class *cl = htb_classify(skb, sch, &ret);
572 
573 	if (cl == HTB_DIRECT) {
574 		/* enqueue to helper queue */
575 		if (q->direct_queue.qlen < q->direct_qlen) {
576 			__skb_queue_tail(&q->direct_queue, skb);
577 			q->direct_pkts++;
578 		} else {
579 			return qdisc_drop(skb, sch);
580 		}
581 #ifdef CONFIG_NET_CLS_ACT
582 	} else if (!cl) {
583 		if (ret & __NET_XMIT_BYPASS)
584 			sch->qstats.drops++;
585 		kfree_skb(skb);
586 		return ret;
587 #endif
588 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
589 		if (net_xmit_drop_count(ret)) {
590 			sch->qstats.drops++;
591 			cl->qstats.drops++;
592 		}
593 		return ret;
594 	} else {
595 		htb_activate(q, cl);
596 	}
597 
598 	sch->q.qlen++;
599 	return NET_XMIT_SUCCESS;
600 }
601 
602 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
603 {
604 	s64 toks = diff + cl->tokens;
605 
606 	if (toks > cl->buffer)
607 		toks = cl->buffer;
608 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
609 	if (toks <= -cl->mbuffer)
610 		toks = 1 - cl->mbuffer;
611 
612 	cl->tokens = toks;
613 }
614 
615 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
616 {
617 	s64 toks = diff + cl->ctokens;
618 
619 	if (toks > cl->cbuffer)
620 		toks = cl->cbuffer;
621 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
622 	if (toks <= -cl->mbuffer)
623 		toks = 1 - cl->mbuffer;
624 
625 	cl->ctokens = toks;
626 }
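/* Both helpers above keep the buckets in nanosecond units: psched_l2t_ns()
 * converts "bytes" into the time needed to transmit them at the configured
 * rate (resp. ceil), and the result is clamped roughly to the
 * [1 - mbuffer, (c)buffer] range so that a long idle period cannot build
 * up an unbounded burst allowance.
 */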
627 
628 /**
629  * htb_charge_class - charges amount "bytes" to leaf and ancestors
630  *
631  * The routine assumes that a packet "bytes" long was dequeued from leaf cl,
632  * borrowing from "level". It accounts the bytes to the ceil leaky bucket for
633  * the leaf and all ancestors and to the rate bucket for ancestors at levels
634  * "level" and higher. It also handles a possible change of mode resulting
635  * from the update. Note that the mode can also increase here (MAY_BORROW to
636  * CAN_SEND) because we can use a more precise clock than the event queue here.
637  * In such a case we remove the class from the event queue first.
638  */
639 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
640 			     int level, struct sk_buff *skb)
641 {
642 	int bytes = qdisc_pkt_len(skb);
643 	enum htb_cmode old_mode;
644 	s64 diff;
645 
646 	while (cl) {
647 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
648 		if (cl->level >= level) {
649 			if (cl->level == level)
650 				cl->xstats.lends++;
651 			htb_accnt_tokens(cl, bytes, diff);
652 		} else {
653 			cl->xstats.borrows++;
654 			cl->tokens += diff;	/* we moved t_c; update tokens */
655 		}
656 		htb_accnt_ctokens(cl, bytes, diff);
657 		cl->t_c = q->now;
658 
659 		old_mode = cl->cmode;
660 		diff = 0;
661 		htb_change_class_mode(q, cl, &diff);
662 		if (old_mode != cl->cmode) {
663 			if (old_mode != HTB_CAN_SEND)
664 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
665 			if (cl->cmode != HTB_CAN_SEND)
666 				htb_add_to_wait_tree(q, cl, diff);
667 		}
668 
669 		/* update basic stats except for leaves which are already updated */
670 		if (cl->level)
671 			bstats_update(&cl->bstats, skb);
672 
673 		cl = cl->parent;
674 	}
675 }
676 
677 /**
678  * htb_do_events - make mode changes to classes at the level
679  *
680  * Scans the event queue for pending events and applies them. Returns the time
681  * of the next pending event (0 for no event in pq, q->now for too many events).
682  * Note: only events which have cl->pq_key <= q->now are applied.
683  */
684 static s64 htb_do_events(struct htb_sched *q, const int level,
685 			 unsigned long start)
686 {
687 	/* don't run for longer than 2 jiffies; 2 is used instead of
688 	 * 1 to simplify things when jiffy is going to be incremented
689 	 * too soon
690 	 */
691 	unsigned long stop_at = start + 2;
692 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
693 
694 	while (time_before(jiffies, stop_at)) {
695 		struct htb_class *cl;
696 		s64 diff;
697 		struct rb_node *p = rb_first(wait_pq);
698 
699 		if (!p)
700 			return 0;
701 
702 		cl = rb_entry(p, struct htb_class, pq_node);
703 		if (cl->pq_key > q->now)
704 			return cl->pq_key;
705 
706 		htb_safe_rb_erase(p, wait_pq);
707 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
708 		htb_change_class_mode(q, cl, &diff);
709 		if (cl->cmode != HTB_CAN_SEND)
710 			htb_add_to_wait_tree(q, cl, diff);
711 	}
712 
713 	/* too much load - let's continue after a break for scheduling */
714 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
715 		pr_warning("htb: too many events!\n");
716 		q->warned |= HTB_WARN_TOOMANYEVENTS;
717 	}
718 
719 	return q->now;
720 }
721 
722 /* Returns class->node+prio from the id-tree where the class's id is >= id.
723  * NULL if no such one exists.
724  */
725 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
726 					      u32 id)
727 {
728 	struct rb_node *r = NULL;
729 	while (n) {
730 		struct htb_class *cl =
731 		    rb_entry(n, struct htb_class, node[prio]);
732 
733 		if (id > cl->common.classid) {
734 			n = n->rb_right;
735 		} else if (id < cl->common.classid) {
736 			r = n;
737 			n = n->rb_left;
738 		} else {
739 			return n;
740 		}
741 	}
742 	return r;
743 }
744 
745 /**
746  * htb_lookup_leaf - returns next leaf class in DRR order
747  * htb_lookup_leaf - returns the next leaf class in DRR order
748  *
749  * Find the leaf the current feed pointer points to.
750 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
751 {
752 	int i;
753 	struct {
754 		struct rb_node *root;
755 		struct rb_node **pptr;
756 		u32 *pid;
757 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
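	/* one stack frame per level of the descent from the row's root down
	 * through the inner-class feed trees to a leaf
	 */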
758 
759 	BUG_ON(!hprio->row.rb_node);
760 	sp->root = hprio->row.rb_node;
761 	sp->pptr = &hprio->ptr;
762 	sp->pid = &hprio->last_ptr_id;
763 
764 	for (i = 0; i < 65535; i++) {
765 		if (!*sp->pptr && *sp->pid) {
766 			/* ptr was invalidated but id is valid - try to recover
767 			 * the original or next ptr
768 			 */
769 			*sp->pptr =
770 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
771 		}
772 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
773 				 * can become out of date quickly
774 				 */
775 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
776 			*sp->pptr = sp->root;
777 			while ((*sp->pptr)->rb_left)
778 				*sp->pptr = (*sp->pptr)->rb_left;
779 			if (sp > stk) {
780 				sp--;
781 				if (!*sp->pptr) {
782 					WARN_ON(1);
783 					return NULL;
784 				}
785 				htb_next_rb_node(sp->pptr);
786 			}
787 		} else {
788 			struct htb_class *cl;
789 			struct htb_prio *clp;
790 
791 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
792 			if (!cl->level)
793 				return cl;
794 			clp = &cl->un.inner.clprio[prio];
795 			(++sp)->root = clp->feed.rb_node;
796 			sp->pptr = &clp->ptr;
797 			sp->pid = &clp->last_ptr_id;
798 		}
799 	}
800 	WARN_ON(1);
801 	return NULL;
802 }
803 
804 /* dequeues a packet at the given priority and level; call only if
805  * you are sure that there is an active class at prio/level
806  */
807 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
808 					const int level)
809 {
810 	struct sk_buff *skb = NULL;
811 	struct htb_class *cl, *start;
812 	struct htb_level *hlevel = &q->hlevel[level];
813 	struct htb_prio *hprio = &hlevel->hprio[prio];
814 
815 	/* look initial class up in the row */
816 	start = cl = htb_lookup_leaf(hprio, prio);
817 
818 	do {
819 next:
820 		if (unlikely(!cl))
821 			return NULL;
822 
823 		/* the class can be empty - it is unlikely but can be true if the leaf
824 		 * qdisc drops packets in its enqueue routine or if someone used
825 		 * the graft operation on the leaf since the last dequeue;
826 		 * simply deactivate and skip such a class
827 		 */
828 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
829 			struct htb_class *next;
830 			htb_deactivate(q, cl);
831 
832 			/* row/level might become empty */
833 			if ((q->row_mask[level] & (1 << prio)) == 0)
834 				return NULL;
835 
836 			next = htb_lookup_leaf(hprio, prio);
837 
838 			if (cl == start)	/* fix start if we just deleted it */
839 				start = next;
840 			cl = next;
841 			goto next;
842 		}
843 
844 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
845 		if (likely(skb != NULL))
846 			break;
847 
848 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
849 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
850 					 &q->hlevel[0].hprio[prio].ptr);
851 		cl = htb_lookup_leaf(hprio, prio);
852 
853 	} while (cl != start);
854 
855 	if (likely(skb != NULL)) {
856 		bstats_update(&cl->bstats, skb);
857 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
858 		if (cl->un.leaf.deficit[level] < 0) {
859 			cl->un.leaf.deficit[level] += cl->quantum;
860 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
861 						 &q->hlevel[0].hprio[prio].ptr);
862 		}
863 		/* this used to be after charge_class but this constellation
864 		 * gives us slightly better performance
865 		 */
866 		if (!cl->un.leaf.q->q.qlen)
867 			htb_deactivate(q, cl);
868 		htb_charge_class(q, cl, level, skb);
869 	}
870 	return skb;
871 }
872 
873 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
874 {
875 	struct sk_buff *skb;
876 	struct htb_sched *q = qdisc_priv(sch);
877 	int level;
878 	s64 next_event;
879 	unsigned long start_at;
880 
881 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
882 	skb = __skb_dequeue(&q->direct_queue);
883 	if (skb != NULL) {
884 ok:
885 		qdisc_bstats_update(sch, skb);
886 		qdisc_unthrottled(sch);
887 		sch->q.qlen--;
888 		return skb;
889 	}
890 
891 	if (!sch->q.qlen)
892 		goto fin;
893 	q->now = ktime_to_ns(ktime_get());
894 	start_at = jiffies;
895 
896 	next_event = q->now + 5LLU * NSEC_PER_SEC;
897 
898 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
899 		/* common case optimization - skip event handler quickly */
900 		int m;
901 		s64 event = q->near_ev_cache[level];
902 
903 		if (q->now >= event) {
904 			event = htb_do_events(q, level, start_at);
905 			if (!event)
906 				event = q->now + NSEC_PER_SEC;
907 			q->near_ev_cache[level] = event;
908 		}
909 
910 		if (next_event > event)
911 			next_event = event;
912 
913 		m = ~q->row_mask[level];
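		/* iterate over the active priorities of this row: ffz() returns
		 * the index of the lowest clear bit in m, i.e. the lowest-numbered
		 * (first-served) prio still set in row_mask; setting the bit
		 * advances the scan
		 */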
914 		while (m != (int)(-1)) {
915 			int prio = ffz(m);
916 
917 			m |= 1 << prio;
918 			skb = htb_dequeue_tree(q, prio, level);
919 			if (likely(skb != NULL))
920 				goto ok;
921 		}
922 	}
923 	sch->qstats.overlimits++;
924 	if (likely(next_event > q->now)) {
925 		if (!test_bit(__QDISC_STATE_DEACTIVATED,
926 			      &qdisc_root_sleeping(q->watchdog.qdisc)->state)) {
927 			ktime_t time = ns_to_ktime(next_event);
928 			qdisc_throttled(q->watchdog.qdisc);
929 			hrtimer_start(&q->watchdog.timer, time,
930 				      HRTIMER_MODE_ABS);
931 		}
932 	} else {
933 		schedule_work(&q->work);
934 	}
935 fin:
936 	return skb;
937 }
938 
939 /* try to drop from each class (by prio) until one succeeds */
940 static unsigned int htb_drop(struct Qdisc *sch)
941 {
942 	struct htb_sched *q = qdisc_priv(sch);
943 	int prio;
944 
945 	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
946 		struct list_head *p;
947 		list_for_each(p, q->drops + prio) {
948 			struct htb_class *cl = list_entry(p, struct htb_class,
949 							  un.leaf.drop_list);
950 			unsigned int len;
951 			if (cl->un.leaf.q->ops->drop &&
952 			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
953 				sch->q.qlen--;
954 				if (!cl->un.leaf.q->q.qlen)
955 					htb_deactivate(q, cl);
956 				return len;
957 			}
958 		}
959 	}
960 	return 0;
961 }
962 
963 /* reset all classes */
964 /* always called under BH & queue lock */
965 static void htb_reset(struct Qdisc *sch)
966 {
967 	struct htb_sched *q = qdisc_priv(sch);
968 	struct htb_class *cl;
969 	unsigned int i;
970 
971 	for (i = 0; i < q->clhash.hashsize; i++) {
972 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
973 			if (cl->level)
974 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
975 			else {
976 				if (cl->un.leaf.q)
977 					qdisc_reset(cl->un.leaf.q);
978 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
979 			}
980 			cl->prio_activity = 0;
981 			cl->cmode = HTB_CAN_SEND;
982 
983 		}
984 	}
985 	qdisc_watchdog_cancel(&q->watchdog);
986 	__skb_queue_purge(&q->direct_queue);
987 	sch->q.qlen = 0;
988 	memset(q->hlevel, 0, sizeof(q->hlevel));
989 	memset(q->row_mask, 0, sizeof(q->row_mask));
990 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
991 		INIT_LIST_HEAD(q->drops + i);
992 }
993 
994 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
995 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
996 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
997 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
998 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
999 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1000 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1001 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1002 };
1003 
1004 static void htb_work_func(struct work_struct *work)
1005 {
1006 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1007 	struct Qdisc *sch = q->watchdog.qdisc;
1008 
1009 	__netif_schedule(qdisc_root(sch));
1010 }
1011 
1012 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1013 {
1014 	struct htb_sched *q = qdisc_priv(sch);
1015 	struct nlattr *tb[TCA_HTB_MAX + 1];
1016 	struct tc_htb_glob *gopt;
1017 	int err;
1018 	int i;
1019 
1020 	if (!opt)
1021 		return -EINVAL;
1022 
1023 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1024 	if (err < 0)
1025 		return err;
1026 
1027 	if (!tb[TCA_HTB_INIT])
1028 		return -EINVAL;
1029 
1030 	gopt = nla_data(tb[TCA_HTB_INIT]);
1031 	if (gopt->version != HTB_VER >> 16)
1032 		return -EINVAL;
1033 
1034 	err = qdisc_class_hash_init(&q->clhash);
1035 	if (err < 0)
1036 		return err;
1037 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1038 		INIT_LIST_HEAD(q->drops + i);
1039 
1040 	qdisc_watchdog_init(&q->watchdog, sch);
1041 	INIT_WORK(&q->work, htb_work_func);
1042 	skb_queue_head_init(&q->direct_queue);
1043 
1044 	if (tb[TCA_HTB_DIRECT_QLEN])
1045 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1046 	else {
1047 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1048 		if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1049 			q->direct_qlen = 2;
1050 	}
1051 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1052 		q->rate2quantum = 1;
1053 	q->defcls = gopt->defcls;
1054 
1055 	return 0;
1056 }
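/* Illustrative mapping to iproute2 (editor's example, not part of the
 * original sources):
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 12 r2q 10
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:12 htb rate 30mbit ceil 100mbit
 *
 * "default 12" ends up in q->defcls and "r2q 10" in q->rate2quantum above;
 * the class parameters are handled by htb_change_class() below.
 */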
1057 
1058 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1059 {
1060 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1061 	struct htb_sched *q = qdisc_priv(sch);
1062 	struct nlattr *nest;
1063 	struct tc_htb_glob gopt;
1064 
1065 	spin_lock_bh(root_lock);
1066 
1067 	gopt.direct_pkts = q->direct_pkts;
1068 	gopt.version = HTB_VER;
1069 	gopt.rate2quantum = q->rate2quantum;
1070 	gopt.defcls = q->defcls;
1071 	gopt.debug = 0;
1072 
1073 	nest = nla_nest_start(skb, TCA_OPTIONS);
1074 	if (nest == NULL)
1075 		goto nla_put_failure;
1076 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1077 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1078 		goto nla_put_failure;
1079 	nla_nest_end(skb, nest);
1080 
1081 	spin_unlock_bh(root_lock);
1082 	return skb->len;
1083 
1084 nla_put_failure:
1085 	spin_unlock_bh(root_lock);
1086 	nla_nest_cancel(skb, nest);
1087 	return -1;
1088 }
1089 
1090 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1091 			  struct sk_buff *skb, struct tcmsg *tcm)
1092 {
1093 	struct htb_class *cl = (struct htb_class *)arg;
1094 	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1095 	struct nlattr *nest;
1096 	struct tc_htb_opt opt;
1097 
1098 	spin_lock_bh(root_lock);
1099 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1100 	tcm->tcm_handle = cl->common.classid;
1101 	if (!cl->level && cl->un.leaf.q)
1102 		tcm->tcm_info = cl->un.leaf.q->handle;
1103 
1104 	nest = nla_nest_start(skb, TCA_OPTIONS);
1105 	if (nest == NULL)
1106 		goto nla_put_failure;
1107 
1108 	memset(&opt, 0, sizeof(opt));
1109 
1110 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1111 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1112 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1113 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1114 	opt.quantum = cl->quantum;
1115 	opt.prio = cl->prio;
1116 	opt.level = cl->level;
1117 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1118 		goto nla_put_failure;
1119 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1120 	    nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps))
1121 		goto nla_put_failure;
1122 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1123 	    nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps))
1124 		goto nla_put_failure;
1125 
1126 	nla_nest_end(skb, nest);
1127 	spin_unlock_bh(root_lock);
1128 	return skb->len;
1129 
1130 nla_put_failure:
1131 	spin_unlock_bh(root_lock);
1132 	nla_nest_cancel(skb, nest);
1133 	return -1;
1134 }
1135 
1136 static int
1137 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1138 {
1139 	struct htb_class *cl = (struct htb_class *)arg;
1140 
1141 	if (!cl->level && cl->un.leaf.q)
1142 		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1143 	cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens);
1144 	cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens);
1145 
1146 	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1147 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1148 	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1149 		return -1;
1150 
1151 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1152 }
1153 
1154 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1155 		     struct Qdisc **old)
1156 {
1157 	struct htb_class *cl = (struct htb_class *)arg;
1158 
1159 	if (cl->level)
1160 		return -EINVAL;
1161 	if (new == NULL &&
1162 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1163 				     cl->common.classid)) == NULL)
1164 		return -ENOBUFS;
1165 
1166 	sch_tree_lock(sch);
1167 	*old = cl->un.leaf.q;
1168 	cl->un.leaf.q = new;
1169 	if (*old != NULL) {
1170 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1171 		qdisc_reset(*old);
1172 	}
1173 	sch_tree_unlock(sch);
1174 	return 0;
1175 }
1176 
1177 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1178 {
1179 	struct htb_class *cl = (struct htb_class *)arg;
1180 	return !cl->level ? cl->un.leaf.q : NULL;
1181 }
1182 
1183 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1184 {
1185 	struct htb_class *cl = (struct htb_class *)arg;
1186 
1187 	if (cl->un.leaf.q->q.qlen == 0)
1188 		htb_deactivate(qdisc_priv(sch), cl);
1189 }
1190 
1191 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1192 {
1193 	struct htb_class *cl = htb_find(classid, sch);
1194 	if (cl)
1195 		cl->refcnt++;
1196 	return (unsigned long)cl;
1197 }
1198 
1199 static inline int htb_parent_last_child(struct htb_class *cl)
1200 {
1201 	if (!cl->parent)
1202 		/* the root class */
1203 		return 0;
1204 	if (cl->parent->children > 1)
1205 		/* not the last child */
1206 		return 0;
1207 	return 1;
1208 }
1209 
1210 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1211 			       struct Qdisc *new_q)
1212 {
1213 	struct htb_class *parent = cl->parent;
1214 
1215 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1216 
1217 	if (parent->cmode != HTB_CAN_SEND)
1218 		htb_safe_rb_erase(&parent->pq_node,
1219 				  &q->hlevel[parent->level].wait_pq);
1220 
1221 	parent->level = 0;
1222 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1223 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1224 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1225 	parent->tokens = parent->buffer;
1226 	parent->ctokens = parent->cbuffer;
1227 	parent->t_c = ktime_to_ns(ktime_get());
1228 	parent->cmode = HTB_CAN_SEND;
1229 }
1230 
1231 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1232 {
1233 	if (!cl->level) {
1234 		WARN_ON(!cl->un.leaf.q);
1235 		qdisc_destroy(cl->un.leaf.q);
1236 	}
1237 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1238 	tcf_destroy_chain(&cl->filter_list);
1239 	kfree(cl);
1240 }
1241 
1242 static void htb_destroy(struct Qdisc *sch)
1243 {
1244 	struct htb_sched *q = qdisc_priv(sch);
1245 	struct hlist_node *next;
1246 	struct htb_class *cl;
1247 	unsigned int i;
1248 
1249 	cancel_work_sync(&q->work);
1250 	qdisc_watchdog_cancel(&q->watchdog);
1251 	/* This line used to be after htb_destroy_class call below
1252 	 * and surprisingly it worked in 2.4. But it must precede it
1253 	 * because a filter needs its target class alive to be able to call
1254 	 * unbind_filter on it (without an Oops).
1255 	 */
1256 	tcf_destroy_chain(&q->filter_list);
1257 
1258 	for (i = 0; i < q->clhash.hashsize; i++) {
1259 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
1260 			tcf_destroy_chain(&cl->filter_list);
1261 	}
1262 	for (i = 0; i < q->clhash.hashsize; i++) {
1263 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1264 					  common.hnode)
1265 			htb_destroy_class(sch, cl);
1266 	}
1267 	qdisc_class_hash_destroy(&q->clhash);
1268 	__skb_queue_purge(&q->direct_queue);
1269 }
1270 
1271 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1272 {
1273 	struct htb_sched *q = qdisc_priv(sch);
1274 	struct htb_class *cl = (struct htb_class *)arg;
1275 	unsigned int qlen;
1276 	struct Qdisc *new_q = NULL;
1277 	int last_child = 0;
1278 
1279 	// TODO: why don't we allow deleting a subtree? references? does the
1280 	// tc subsys guarantee us that in htb_destroy it holds no class
1281 	// refs so that we can remove children safely there?
1282 	if (cl->children || cl->filter_cnt)
1283 		return -EBUSY;
1284 
1285 	if (!cl->level && htb_parent_last_child(cl)) {
1286 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1287 					  cl->parent->common.classid);
1288 		last_child = 1;
1289 	}
1290 
1291 	sch_tree_lock(sch);
1292 
1293 	if (!cl->level) {
1294 		qlen = cl->un.leaf.q->q.qlen;
1295 		qdisc_reset(cl->un.leaf.q);
1296 		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1297 	}
1298 
1299 	/* delete from hash and active; remainder in destroy_class */
1300 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1301 	if (cl->parent)
1302 		cl->parent->children--;
1303 
1304 	if (cl->prio_activity)
1305 		htb_deactivate(q, cl);
1306 
1307 	if (cl->cmode != HTB_CAN_SEND)
1308 		htb_safe_rb_erase(&cl->pq_node,
1309 				  &q->hlevel[cl->level].wait_pq);
1310 
1311 	if (last_child)
1312 		htb_parent_to_leaf(q, cl, new_q);
1313 
1314 	BUG_ON(--cl->refcnt == 0);
1315 	/*
1316 	 * This shouldn't happen: we "hold" one cops->get() when called
1317 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1318 	 */
1319 
1320 	sch_tree_unlock(sch);
1321 	return 0;
1322 }
1323 
1324 static void htb_put(struct Qdisc *sch, unsigned long arg)
1325 {
1326 	struct htb_class *cl = (struct htb_class *)arg;
1327 
1328 	if (--cl->refcnt == 0)
1329 		htb_destroy_class(sch, cl);
1330 }
1331 
1332 static int htb_change_class(struct Qdisc *sch, u32 classid,
1333 			    u32 parentid, struct nlattr **tca,
1334 			    unsigned long *arg)
1335 {
1336 	int err = -EINVAL;
1337 	struct htb_sched *q = qdisc_priv(sch);
1338 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1339 	struct nlattr *opt = tca[TCA_OPTIONS];
1340 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1341 	struct nlattr *tb[TCA_HTB_MAX + 1];
1342 	struct tc_htb_opt *hopt;
1343 	u64 rate64, ceil64;
1344 
1345 	/* extract all subattrs from opt attr */
1346 	if (!opt)
1347 		goto failure;
1348 
1349 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1350 	if (err < 0)
1351 		goto failure;
1352 
1353 	err = -EINVAL;
1354 	if (tb[TCA_HTB_PARMS] == NULL)
1355 		goto failure;
1356 
1357 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1358 
1359 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1360 	if (!hopt->rate.rate || !hopt->ceil.rate)
1361 		goto failure;
1362 
1363 	/* Keeping backward compatibility with rate_table based iproute2 tc */
1364 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) {
1365 		rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1366 		if (rtab)
1367 			qdisc_put_rtab(rtab);
1368 	}
1369 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE) {
1370 		ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1371 		if (ctab)
1372 			qdisc_put_rtab(ctab);
1373 	}
1374 
1375 	if (!cl) {		/* new class */
1376 		struct Qdisc *new_q;
1377 		int prio;
1378 		struct {
1379 			struct nlattr		nla;
1380 			struct gnet_estimator	opt;
1381 		} est = {
1382 			.nla = {
1383 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1384 				.nla_type	= TCA_RATE,
1385 			},
1386 			.opt = {
1387 				/* 4s interval, 16s averaging constant */
1388 				.interval	= 2,
1389 				.ewma_log	= 2,
1390 			},
1391 		};
1392 
1393 		/* check for valid classid */
1394 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1395 		    htb_find(classid, sch))
1396 			goto failure;
1397 
1398 		/* check maximal depth */
1399 		if (parent && parent->parent && parent->parent->level < 2) {
1400 			pr_err("htb: tree is too deep\n");
1401 			goto failure;
1402 		}
1403 		err = -ENOBUFS;
1404 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1405 		if (!cl)
1406 			goto failure;
1407 
1408 		if (htb_rate_est || tca[TCA_RATE]) {
1409 			err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1410 						qdisc_root_sleeping_lock(sch),
1411 						tca[TCA_RATE] ? : &est.nla);
1412 			if (err) {
1413 				kfree(cl);
1414 				goto failure;
1415 			}
1416 		}
1417 
1418 		cl->refcnt = 1;
1419 		cl->children = 0;
1420 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1421 		RB_CLEAR_NODE(&cl->pq_node);
1422 
1423 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1424 			RB_CLEAR_NODE(&cl->node[prio]);
1425 
1426 		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1427 		 * so it can't be created inside of sch_tree_lock
1428 		 * -- thanks to Karlis Peisenieks
1429 		 */
1430 		new_q = qdisc_create_dflt(sch->dev_queue,
1431 					  &pfifo_qdisc_ops, classid);
1432 		sch_tree_lock(sch);
1433 		if (parent && !parent->level) {
1434 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1435 
1436 			/* turn parent into inner node */
1437 			qdisc_reset(parent->un.leaf.q);
1438 			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1439 			qdisc_destroy(parent->un.leaf.q);
1440 			if (parent->prio_activity)
1441 				htb_deactivate(q, parent);
1442 
1443 			/* remove from evt list because of level change */
1444 			if (parent->cmode != HTB_CAN_SEND) {
1445 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1446 				parent->cmode = HTB_CAN_SEND;
1447 			}
1448 			parent->level = (parent->parent ? parent->parent->level
1449 					 : TC_HTB_MAXDEPTH) - 1;
1450 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1451 		}
1452 		/* leaf (we) needs elementary qdisc */
1453 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1454 
1455 		cl->common.classid = classid;
1456 		cl->parent = parent;
1457 
1458 		/* set class to be in HTB_CAN_SEND state */
1459 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1460 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1461 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1462 		cl->t_c = ktime_to_ns(ktime_get());
1463 		cl->cmode = HTB_CAN_SEND;
1464 
1465 		/* attach to the hash list and parent's family */
1466 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1467 		if (parent)
1468 			parent->children++;
1469 	} else {
1470 		if (tca[TCA_RATE]) {
1471 			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1472 						    qdisc_root_sleeping_lock(sch),
1473 						    tca[TCA_RATE]);
1474 			if (err)
1475 				return err;
1476 		}
1477 		sch_tree_lock(sch);
1478 	}
1479 
1480 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1481 
1482 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1483 
1484 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1485 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1486 
1487 	/* there used to be a nasty bug here: we have to check that the node
1488 	 * is really a leaf before changing cl->un.leaf !
1489 	 */
1490 	if (!cl->level) {
1491 		u64 quantum = cl->rate.rate_bytes_ps;
1492 
1493 		do_div(quantum, q->rate2quantum);
1494 		cl->quantum = min_t(u64, quantum, INT_MAX);
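		/* e.g. (illustrative): rate 1mbit gives rate_bytes_ps 125000;
		 * with r2q 10 this yields quantum 12500 bytes, inside the sane
		 * [1000, 200000] range enforced below
		 */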
1495 
1496 		if (!hopt->quantum && cl->quantum < 1000) {
1497 			pr_warning(
1498 			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1499 			       cl->common.classid);
1500 			cl->quantum = 1000;
1501 		}
1502 		if (!hopt->quantum && cl->quantum > 200000) {
1503 			pr_warning(
1504 			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1505 			       cl->common.classid);
1506 			cl->quantum = 200000;
1507 		}
1508 		if (hopt->quantum)
1509 			cl->quantum = hopt->quantum;
1510 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1511 			cl->prio = TC_HTB_NUMPRIO - 1;
1512 	}
1513 
1514 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1515 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1516 
1517 	sch_tree_unlock(sch);
1518 
1519 	qdisc_class_hash_grow(sch, &q->clhash);
1520 
1521 	*arg = (unsigned long)cl;
1522 	return 0;
1523 
1524 failure:
1525 	return err;
1526 }
1527 
1528 static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1529 {
1530 	struct htb_sched *q = qdisc_priv(sch);
1531 	struct htb_class *cl = (struct htb_class *)arg;
1532 	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1533 
1534 	return fl;
1535 }
1536 
1537 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1538 				     u32 classid)
1539 {
1540 	struct htb_class *cl = htb_find(classid, sch);
1541 
1542 	/*if (cl && !cl->level) return 0;
1543 	 * The line above used to be there to prevent attaching filters to
1544 	 * leaves. But at least tc_index filter uses this just to get class
1545 	 * for other reasons, so we have to allow it.
1546 	 * ----
1547 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1548 	 * another way to "lock" the class - unlike "get" this lock can
1549 	 * be broken by class during destroy IIUC.
1550 	 */
1551 	if (cl)
1552 		cl->filter_cnt++;
1553 	return (unsigned long)cl;
1554 }
1555 
1556 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1557 {
1558 	struct htb_class *cl = (struct htb_class *)arg;
1559 
1560 	if (cl)
1561 		cl->filter_cnt--;
1562 }
1563 
1564 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1565 {
1566 	struct htb_sched *q = qdisc_priv(sch);
1567 	struct htb_class *cl;
1568 	unsigned int i;
1569 
1570 	if (arg->stop)
1571 		return;
1572 
1573 	for (i = 0; i < q->clhash.hashsize; i++) {
1574 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1575 			if (arg->count < arg->skip) {
1576 				arg->count++;
1577 				continue;
1578 			}
1579 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1580 				arg->stop = 1;
1581 				return;
1582 			}
1583 			arg->count++;
1584 		}
1585 	}
1586 }
1587 
1588 static const struct Qdisc_class_ops htb_class_ops = {
1589 	.graft		=	htb_graft,
1590 	.leaf		=	htb_leaf,
1591 	.qlen_notify	=	htb_qlen_notify,
1592 	.get		=	htb_get,
1593 	.put		=	htb_put,
1594 	.change		=	htb_change_class,
1595 	.delete		=	htb_delete,
1596 	.walk		=	htb_walk,
1597 	.tcf_chain	=	htb_find_tcf,
1598 	.bind_tcf	=	htb_bind_filter,
1599 	.unbind_tcf	=	htb_unbind_filter,
1600 	.dump		=	htb_dump_class,
1601 	.dump_stats	=	htb_dump_class_stats,
1602 };
1603 
1604 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1605 	.cl_ops		=	&htb_class_ops,
1606 	.id		=	"htb",
1607 	.priv_size	=	sizeof(struct htb_sched),
1608 	.enqueue	=	htb_enqueue,
1609 	.dequeue	=	htb_dequeue,
1610 	.peek		=	qdisc_peek_dequeued,
1611 	.drop		=	htb_drop,
1612 	.init		=	htb_init,
1613 	.reset		=	htb_reset,
1614 	.destroy	=	htb_destroy,
1615 	.dump		=	htb_dump,
1616 	.owner		=	THIS_MODULE,
1617 };
1618 
1619 static int __init htb_module_init(void)
1620 {
1621 	return register_qdisc(&htb_qdisc_ops);
1622 }
1623 static void __exit htb_module_exit(void)
1624 {
1625 	unregister_qdisc(&htb_qdisc_ops);
1626 }
1627 
1628 module_init(htb_module_init)
1629 module_exit(htb_module_exit)
1630 MODULE_LICENSE("GPL");
1631