xref: /openbmc/linux/net/sched/sch_htb.c (revision bc5aa3a0)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 
44 /* HTB algorithm.
45     Author: devik@cdi.cz
46     ========================================================================
47     HTB is like TBF with multiple classes. It is also similar to CBQ because
48     it allows to assign priority to each class in hierarchy.
49     In fact it is another implementation of Floyd's formal sharing.
50 
51     Levels:
52     Each class is assigned level. Leaf has ALWAYS level 0 and root
53     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
54     one less than their parent.
55 */
56 
57 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
58 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
59 
60 #if HTB_VER >> 16 != TC_HTB_PROTOVER
61 #error "Mismatched sch_htb.c and pkt_sch.h"
62 #endif
63 
64 /* Module parameter and sysfs export */
65 module_param    (htb_hysteresis, int, 0640);
66 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
67 
68 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
69 module_param(htb_rate_est, int, 0640);
70 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
71 
72 /* used internaly to keep status of single class */
73 enum htb_cmode {
74 	HTB_CANT_SEND,		/* class can't send and can't borrow */
75 	HTB_MAY_BORROW,		/* class can't send but may borrow */
76 	HTB_CAN_SEND		/* class can send */
77 };
78 
79 struct htb_prio {
80 	union {
81 		struct rb_root	row;
82 		struct rb_root	feed;
83 	};
84 	struct rb_node	*ptr;
85 	/* When class changes from state 1->2 and disconnects from
86 	 * parent's feed then we lost ptr value and start from the
87 	 * first child again. Here we store classid of the
88 	 * last valid ptr (used when ptr is NULL).
89 	 */
90 	u32		last_ptr_id;
91 };
92 
93 /* interior & leaf nodes; props specific to leaves are marked L:
94  * To reduce false sharing, place mostly read fields at beginning,
95  * and mostly written ones at the end.
96  */
97 struct htb_class {
98 	struct Qdisc_class_common common;
99 	struct psched_ratecfg	rate;
100 	struct psched_ratecfg	ceil;
101 	s64			buffer, cbuffer;/* token bucket depth/rate */
102 	s64			mbuffer;	/* max wait time */
103 	u32			prio;		/* these two are used only by leaves... */
104 	int			quantum;	/* but stored for parent-to-leaf return */
105 
106 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
107 	int			filter_cnt;
108 	int			refcnt;		/* usage count of this class */
109 
110 	int			level;		/* our level (see above) */
111 	unsigned int		children;
112 	struct htb_class	*parent;	/* parent class */
113 
114 	struct gnet_stats_rate_est64 rate_est;
115 
116 	/*
117 	 * Written often fields
118 	 */
119 	struct gnet_stats_basic_packed bstats;
120 	struct tc_htb_xstats	xstats;	/* our special stats */
121 
122 	/* token bucket parameters */
123 	s64			tokens, ctokens;/* current number of tokens */
124 	s64			t_c;		/* checkpoint time */
125 
126 	union {
127 		struct htb_class_leaf {
128 			struct list_head drop_list;
129 			int		deficit[TC_HTB_MAXDEPTH];
130 			struct Qdisc	*q;
131 		} leaf;
132 		struct htb_class_inner {
133 			struct htb_prio clprio[TC_HTB_NUMPRIO];
134 		} inner;
135 	} un;
136 	s64			pq_key;
137 
138 	int			prio_activity;	/* for which prios are we active */
139 	enum htb_cmode		cmode;		/* current mode of the class */
140 	struct rb_node		pq_node;	/* node for event queue */
141 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
142 
143 	unsigned int drops ____cacheline_aligned_in_smp;
144 };
145 
146 struct htb_level {
147 	struct rb_root	wait_pq;
148 	struct htb_prio hprio[TC_HTB_NUMPRIO];
149 };
150 
151 struct htb_sched {
152 	struct Qdisc_class_hash clhash;
153 	int			defcls;		/* class where unclassified flows go to */
154 	int			rate2quantum;	/* quant = rate / rate2quantum */
155 
156 	/* filters for qdisc itself */
157 	struct tcf_proto __rcu	*filter_list;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non shaped skbs; let them go directly thru */
165 	struct sk_buff_head	direct_queue;
166 	long			direct_pkts;
167 
168 	struct qdisc_watchdog	watchdog;
169 
170 	s64			now;	/* cached dequeue time */
171 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 };
180 
181 /* find class in global hash table using given handle */
182 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
183 {
184 	struct htb_sched *q = qdisc_priv(sch);
185 	struct Qdisc_class_common *clc;
186 
187 	clc = qdisc_class_find(&q->clhash, handle);
188 	if (clc == NULL)
189 		return NULL;
190 	return container_of(clc, struct htb_class, common);
191 }
192 
193 /**
194  * htb_classify - classify a packet into class
195  *
196  * It returns NULL if the packet should be dropped or -1 if the packet
197  * should be passed directly thru. In all other cases leaf class is returned.
198  * We allow direct class selection by classid in priority. The we examine
199  * filters in qdisc and in inner nodes (if higher filter points to the inner
200  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
201  * internal fifo (direct). These packets then go directly thru. If we still
202  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
203  * then finish and return direct queue.
204  */
205 #define HTB_DIRECT ((struct htb_class *)-1L)
206 
207 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
208 				      int *qerr)
209 {
210 	struct htb_sched *q = qdisc_priv(sch);
211 	struct htb_class *cl;
212 	struct tcf_result res;
213 	struct tcf_proto *tcf;
214 	int result;
215 
216 	/* allow to select class by setting skb->priority to valid classid;
217 	 * note that nfmark can be used too by attaching filter fw with no
218 	 * rules in it
219 	 */
220 	if (skb->priority == sch->handle)
221 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
222 	cl = htb_find(skb->priority, sch);
223 	if (cl) {
224 		if (cl->level == 0)
225 			return cl;
226 		/* Start with inner filter chain if a non-leaf class is selected */
227 		tcf = rcu_dereference_bh(cl->filter_list);
228 	} else {
229 		tcf = rcu_dereference_bh(q->filter_list);
230 	}
231 
232 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
233 	while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
234 #ifdef CONFIG_NET_CLS_ACT
235 		switch (result) {
236 		case TC_ACT_QUEUED:
237 		case TC_ACT_STOLEN:
238 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
239 		case TC_ACT_SHOT:
240 			return NULL;
241 		}
242 #endif
243 		cl = (void *)res.class;
244 		if (!cl) {
245 			if (res.classid == sch->handle)
246 				return HTB_DIRECT;	/* X:0 (direct flow) */
247 			cl = htb_find(res.classid, sch);
248 			if (!cl)
249 				break;	/* filter selected invalid classid */
250 		}
251 		if (!cl->level)
252 			return cl;	/* we hit leaf; return it */
253 
254 		/* we have got inner class; apply inner filter chain */
255 		tcf = rcu_dereference_bh(cl->filter_list);
256 	}
257 	/* classification failed; try to use default class */
258 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
259 	if (!cl || cl->level)
260 		return HTB_DIRECT;	/* bad default .. this is safe bet */
261 	return cl;
262 }
263 
264 /**
265  * htb_add_to_id_tree - adds class to the round robin list
266  *
267  * Routine adds class to the list (actually tree) sorted by classid.
268  * Make sure that class is not already on such list for given prio.
269  */
270 static void htb_add_to_id_tree(struct rb_root *root,
271 			       struct htb_class *cl, int prio)
272 {
273 	struct rb_node **p = &root->rb_node, *parent = NULL;
274 
275 	while (*p) {
276 		struct htb_class *c;
277 		parent = *p;
278 		c = rb_entry(parent, struct htb_class, node[prio]);
279 
280 		if (cl->common.classid > c->common.classid)
281 			p = &parent->rb_right;
282 		else
283 			p = &parent->rb_left;
284 	}
285 	rb_link_node(&cl->node[prio], parent, p);
286 	rb_insert_color(&cl->node[prio], root);
287 }
288 
289 /**
290  * htb_add_to_wait_tree - adds class to the event queue with delay
291  *
292  * The class is added to priority event queue to indicate that class will
293  * change its mode in cl->pq_key microseconds. Make sure that class is not
294  * already in the queue.
295  */
296 static void htb_add_to_wait_tree(struct htb_sched *q,
297 				 struct htb_class *cl, s64 delay)
298 {
299 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
300 
301 	cl->pq_key = q->now + delay;
302 	if (cl->pq_key == q->now)
303 		cl->pq_key++;
304 
305 	/* update the nearest event cache */
306 	if (q->near_ev_cache[cl->level] > cl->pq_key)
307 		q->near_ev_cache[cl->level] = cl->pq_key;
308 
309 	while (*p) {
310 		struct htb_class *c;
311 		parent = *p;
312 		c = rb_entry(parent, struct htb_class, pq_node);
313 		if (cl->pq_key >= c->pq_key)
314 			p = &parent->rb_right;
315 		else
316 			p = &parent->rb_left;
317 	}
318 	rb_link_node(&cl->pq_node, parent, p);
319 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
320 }
321 
322 /**
323  * htb_next_rb_node - finds next node in binary tree
324  *
325  * When we are past last key we return NULL.
326  * Average complexity is 2 steps per call.
327  */
328 static inline void htb_next_rb_node(struct rb_node **n)
329 {
330 	*n = rb_next(*n);
331 }
332 
333 /**
334  * htb_add_class_to_row - add class to its row
335  *
336  * The class is added to row at priorities marked in mask.
337  * It does nothing if mask == 0.
338  */
339 static inline void htb_add_class_to_row(struct htb_sched *q,
340 					struct htb_class *cl, int mask)
341 {
342 	q->row_mask[cl->level] |= mask;
343 	while (mask) {
344 		int prio = ffz(~mask);
345 		mask &= ~(1 << prio);
346 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
347 	}
348 }
349 
350 /* If this triggers, it is a bug in this code, but it need not be fatal */
351 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
352 {
353 	if (RB_EMPTY_NODE(rb)) {
354 		WARN_ON(1);
355 	} else {
356 		rb_erase(rb, root);
357 		RB_CLEAR_NODE(rb);
358 	}
359 }
360 
361 
362 /**
363  * htb_remove_class_from_row - removes class from its row
364  *
365  * The class is removed from row at priorities marked in mask.
366  * It does nothing if mask == 0.
367  */
368 static inline void htb_remove_class_from_row(struct htb_sched *q,
369 						 struct htb_class *cl, int mask)
370 {
371 	int m = 0;
372 	struct htb_level *hlevel = &q->hlevel[cl->level];
373 
374 	while (mask) {
375 		int prio = ffz(~mask);
376 		struct htb_prio *hprio = &hlevel->hprio[prio];
377 
378 		mask &= ~(1 << prio);
379 		if (hprio->ptr == cl->node + prio)
380 			htb_next_rb_node(&hprio->ptr);
381 
382 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
383 		if (!hprio->row.rb_node)
384 			m |= 1 << prio;
385 	}
386 	q->row_mask[cl->level] &= ~m;
387 }
388 
389 /**
390  * htb_activate_prios - creates active classe's feed chain
391  *
392  * The class is connected to ancestors and/or appropriate rows
393  * for priorities it is participating on. cl->cmode must be new
394  * (activated) mode. It does nothing if cl->prio_activity == 0.
395  */
396 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
397 {
398 	struct htb_class *p = cl->parent;
399 	long m, mask = cl->prio_activity;
400 
401 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
402 		m = mask;
403 		while (m) {
404 			int prio = ffz(~m);
405 			m &= ~(1 << prio);
406 
407 			if (p->un.inner.clprio[prio].feed.rb_node)
408 				/* parent already has its feed in use so that
409 				 * reset bit in mask as parent is already ok
410 				 */
411 				mask &= ~(1 << prio);
412 
413 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
414 		}
415 		p->prio_activity |= mask;
416 		cl = p;
417 		p = cl->parent;
418 
419 	}
420 	if (cl->cmode == HTB_CAN_SEND && mask)
421 		htb_add_class_to_row(q, cl, mask);
422 }
423 
424 /**
425  * htb_deactivate_prios - remove class from feed chain
426  *
427  * cl->cmode must represent old mode (before deactivation). It does
428  * nothing if cl->prio_activity == 0. Class is removed from all feed
429  * chains and rows.
430  */
431 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
432 {
433 	struct htb_class *p = cl->parent;
434 	long m, mask = cl->prio_activity;
435 
436 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
437 		m = mask;
438 		mask = 0;
439 		while (m) {
440 			int prio = ffz(~m);
441 			m &= ~(1 << prio);
442 
443 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
444 				/* we are removing child which is pointed to from
445 				 * parent feed - forget the pointer but remember
446 				 * classid
447 				 */
448 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
449 				p->un.inner.clprio[prio].ptr = NULL;
450 			}
451 
452 			htb_safe_rb_erase(cl->node + prio,
453 					  &p->un.inner.clprio[prio].feed);
454 
455 			if (!p->un.inner.clprio[prio].feed.rb_node)
456 				mask |= 1 << prio;
457 		}
458 
459 		p->prio_activity &= ~mask;
460 		cl = p;
461 		p = cl->parent;
462 
463 	}
464 	if (cl->cmode == HTB_CAN_SEND && mask)
465 		htb_remove_class_from_row(q, cl, mask);
466 }
467 
468 static inline s64 htb_lowater(const struct htb_class *cl)
469 {
470 	if (htb_hysteresis)
471 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
472 	else
473 		return 0;
474 }
475 static inline s64 htb_hiwater(const struct htb_class *cl)
476 {
477 	if (htb_hysteresis)
478 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
479 	else
480 		return 0;
481 }
482 
483 
484 /**
485  * htb_class_mode - computes and returns current class mode
486  *
487  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
488  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
489  * from now to time when cl will change its state.
490  * Also it is worth to note that class mode doesn't change simply
491  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
492  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
493  * mode transitions per time unit. The speed gain is about 1/6.
494  */
495 static inline enum htb_cmode
496 htb_class_mode(struct htb_class *cl, s64 *diff)
497 {
498 	s64 toks;
499 
500 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
501 		*diff = -toks;
502 		return HTB_CANT_SEND;
503 	}
504 
505 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
506 		return HTB_CAN_SEND;
507 
508 	*diff = -toks;
509 	return HTB_MAY_BORROW;
510 }
511 
512 /**
513  * htb_change_class_mode - changes classe's mode
514  *
515  * This should be the only way how to change classe's mode under normal
516  * cirsumstances. Routine will update feed lists linkage, change mode
517  * and add class to the wait event queue if appropriate. New mode should
518  * be different from old one and cl->pq_key has to be valid if changing
519  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
520  */
521 static void
522 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
523 {
524 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
525 
526 	if (new_mode == cl->cmode)
527 		return;
528 
529 	if (cl->prio_activity) {	/* not necessary: speed optimization */
530 		if (cl->cmode != HTB_CANT_SEND)
531 			htb_deactivate_prios(q, cl);
532 		cl->cmode = new_mode;
533 		if (new_mode != HTB_CANT_SEND)
534 			htb_activate_prios(q, cl);
535 	} else
536 		cl->cmode = new_mode;
537 }
538 
539 /**
540  * htb_activate - inserts leaf cl into appropriate active feeds
541  *
542  * Routine learns (new) priority of leaf and activates feed chain
543  * for the prio. It can be called on already active leaf safely.
544  * It also adds leaf into droplist.
545  */
546 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
547 {
548 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
549 
550 	if (!cl->prio_activity) {
551 		cl->prio_activity = 1 << cl->prio;
552 		htb_activate_prios(q, cl);
553 		list_add_tail(&cl->un.leaf.drop_list,
554 			      q->drops + cl->prio);
555 	}
556 }
557 
558 /**
559  * htb_deactivate - remove leaf cl from active feeds
560  *
561  * Make sure that leaf is active. In the other words it can't be called
562  * with non-active leaf. It also removes class from the drop list.
563  */
564 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
565 {
566 	WARN_ON(!cl->prio_activity);
567 
568 	htb_deactivate_prios(q, cl);
569 	cl->prio_activity = 0;
570 	list_del_init(&cl->un.leaf.drop_list);
571 }
572 
573 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
574 		       struct sk_buff **to_free)
575 {
576 	int uninitialized_var(ret);
577 	struct htb_sched *q = qdisc_priv(sch);
578 	struct htb_class *cl = htb_classify(skb, sch, &ret);
579 
580 	if (cl == HTB_DIRECT) {
581 		/* enqueue to helper queue */
582 		if (q->direct_queue.qlen < q->direct_qlen) {
583 			__skb_queue_tail(&q->direct_queue, skb);
584 			q->direct_pkts++;
585 		} else {
586 			return qdisc_drop(skb, sch, to_free);
587 		}
588 #ifdef CONFIG_NET_CLS_ACT
589 	} else if (!cl) {
590 		if (ret & __NET_XMIT_BYPASS)
591 			qdisc_qstats_drop(sch);
592 		__qdisc_drop(skb, to_free);
593 		return ret;
594 #endif
595 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
596 					to_free)) != NET_XMIT_SUCCESS) {
597 		if (net_xmit_drop_count(ret)) {
598 			qdisc_qstats_drop(sch);
599 			cl->drops++;
600 		}
601 		return ret;
602 	} else {
603 		htb_activate(q, cl);
604 	}
605 
606 	qdisc_qstats_backlog_inc(sch, skb);
607 	sch->q.qlen++;
608 	return NET_XMIT_SUCCESS;
609 }
610 
611 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
612 {
613 	s64 toks = diff + cl->tokens;
614 
615 	if (toks > cl->buffer)
616 		toks = cl->buffer;
617 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
618 	if (toks <= -cl->mbuffer)
619 		toks = 1 - cl->mbuffer;
620 
621 	cl->tokens = toks;
622 }
623 
624 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
625 {
626 	s64 toks = diff + cl->ctokens;
627 
628 	if (toks > cl->cbuffer)
629 		toks = cl->cbuffer;
630 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
631 	if (toks <= -cl->mbuffer)
632 		toks = 1 - cl->mbuffer;
633 
634 	cl->ctokens = toks;
635 }
636 
637 /**
638  * htb_charge_class - charges amount "bytes" to leaf and ancestors
639  *
640  * Routine assumes that packet "bytes" long was dequeued from leaf cl
641  * borrowing from "level". It accounts bytes to ceil leaky bucket for
642  * leaf and all ancestors and to rate bucket for ancestors at levels
643  * "level" and higher. It also handles possible change of mode resulting
644  * from the update. Note that mode can also increase here (MAY_BORROW to
645  * CAN_SEND) because we can use more precise clock that event queue here.
646  * In such case we remove class from event queue first.
647  */
648 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
649 			     int level, struct sk_buff *skb)
650 {
651 	int bytes = qdisc_pkt_len(skb);
652 	enum htb_cmode old_mode;
653 	s64 diff;
654 
655 	while (cl) {
656 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
657 		if (cl->level >= level) {
658 			if (cl->level == level)
659 				cl->xstats.lends++;
660 			htb_accnt_tokens(cl, bytes, diff);
661 		} else {
662 			cl->xstats.borrows++;
663 			cl->tokens += diff;	/* we moved t_c; update tokens */
664 		}
665 		htb_accnt_ctokens(cl, bytes, diff);
666 		cl->t_c = q->now;
667 
668 		old_mode = cl->cmode;
669 		diff = 0;
670 		htb_change_class_mode(q, cl, &diff);
671 		if (old_mode != cl->cmode) {
672 			if (old_mode != HTB_CAN_SEND)
673 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
674 			if (cl->cmode != HTB_CAN_SEND)
675 				htb_add_to_wait_tree(q, cl, diff);
676 		}
677 
678 		/* update basic stats except for leaves which are already updated */
679 		if (cl->level)
680 			bstats_update(&cl->bstats, skb);
681 
682 		cl = cl->parent;
683 	}
684 }
685 
686 /**
687  * htb_do_events - make mode changes to classes at the level
688  *
689  * Scans event queue for pending events and applies them. Returns time of
690  * next pending event (0 for no event in pq, q->now for too many events).
691  * Note: Applied are events whose have cl->pq_key <= q->now.
692  */
693 static s64 htb_do_events(struct htb_sched *q, const int level,
694 			 unsigned long start)
695 {
696 	/* don't run for longer than 2 jiffies; 2 is used instead of
697 	 * 1 to simplify things when jiffy is going to be incremented
698 	 * too soon
699 	 */
700 	unsigned long stop_at = start + 2;
701 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
702 
703 	while (time_before(jiffies, stop_at)) {
704 		struct htb_class *cl;
705 		s64 diff;
706 		struct rb_node *p = rb_first(wait_pq);
707 
708 		if (!p)
709 			return 0;
710 
711 		cl = rb_entry(p, struct htb_class, pq_node);
712 		if (cl->pq_key > q->now)
713 			return cl->pq_key;
714 
715 		htb_safe_rb_erase(p, wait_pq);
716 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
717 		htb_change_class_mode(q, cl, &diff);
718 		if (cl->cmode != HTB_CAN_SEND)
719 			htb_add_to_wait_tree(q, cl, diff);
720 	}
721 
722 	/* too much load - let's continue after a break for scheduling */
723 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
724 		pr_warn("htb: too many events!\n");
725 		q->warned |= HTB_WARN_TOOMANYEVENTS;
726 	}
727 
728 	return q->now;
729 }
730 
731 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
732  * is no such one exists.
733  */
734 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
735 					      u32 id)
736 {
737 	struct rb_node *r = NULL;
738 	while (n) {
739 		struct htb_class *cl =
740 		    rb_entry(n, struct htb_class, node[prio]);
741 
742 		if (id > cl->common.classid) {
743 			n = n->rb_right;
744 		} else if (id < cl->common.classid) {
745 			r = n;
746 			n = n->rb_left;
747 		} else {
748 			return n;
749 		}
750 	}
751 	return r;
752 }
753 
754 /**
755  * htb_lookup_leaf - returns next leaf class in DRR order
756  *
757  * Find leaf where current feed pointers points to.
758  */
759 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
760 {
761 	int i;
762 	struct {
763 		struct rb_node *root;
764 		struct rb_node **pptr;
765 		u32 *pid;
766 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
767 
768 	BUG_ON(!hprio->row.rb_node);
769 	sp->root = hprio->row.rb_node;
770 	sp->pptr = &hprio->ptr;
771 	sp->pid = &hprio->last_ptr_id;
772 
773 	for (i = 0; i < 65535; i++) {
774 		if (!*sp->pptr && *sp->pid) {
775 			/* ptr was invalidated but id is valid - try to recover
776 			 * the original or next ptr
777 			 */
778 			*sp->pptr =
779 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
780 		}
781 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
782 				 * can become out of date quickly
783 				 */
784 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
785 			*sp->pptr = sp->root;
786 			while ((*sp->pptr)->rb_left)
787 				*sp->pptr = (*sp->pptr)->rb_left;
788 			if (sp > stk) {
789 				sp--;
790 				if (!*sp->pptr) {
791 					WARN_ON(1);
792 					return NULL;
793 				}
794 				htb_next_rb_node(sp->pptr);
795 			}
796 		} else {
797 			struct htb_class *cl;
798 			struct htb_prio *clp;
799 
800 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
801 			if (!cl->level)
802 				return cl;
803 			clp = &cl->un.inner.clprio[prio];
804 			(++sp)->root = clp->feed.rb_node;
805 			sp->pptr = &clp->ptr;
806 			sp->pid = &clp->last_ptr_id;
807 		}
808 	}
809 	WARN_ON(1);
810 	return NULL;
811 }
812 
813 /* dequeues packet at given priority and level; call only if
814  * you are sure that there is active class at prio/level
815  */
816 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
817 					const int level)
818 {
819 	struct sk_buff *skb = NULL;
820 	struct htb_class *cl, *start;
821 	struct htb_level *hlevel = &q->hlevel[level];
822 	struct htb_prio *hprio = &hlevel->hprio[prio];
823 
824 	/* look initial class up in the row */
825 	start = cl = htb_lookup_leaf(hprio, prio);
826 
827 	do {
828 next:
829 		if (unlikely(!cl))
830 			return NULL;
831 
832 		/* class can be empty - it is unlikely but can be true if leaf
833 		 * qdisc drops packets in enqueue routine or if someone used
834 		 * graft operation on the leaf since last dequeue;
835 		 * simply deactivate and skip such class
836 		 */
837 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
838 			struct htb_class *next;
839 			htb_deactivate(q, cl);
840 
841 			/* row/level might become empty */
842 			if ((q->row_mask[level] & (1 << prio)) == 0)
843 				return NULL;
844 
845 			next = htb_lookup_leaf(hprio, prio);
846 
847 			if (cl == start)	/* fix start if we just deleted it */
848 				start = next;
849 			cl = next;
850 			goto next;
851 		}
852 
853 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
854 		if (likely(skb != NULL))
855 			break;
856 
857 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
858 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
859 					 &q->hlevel[0].hprio[prio].ptr);
860 		cl = htb_lookup_leaf(hprio, prio);
861 
862 	} while (cl != start);
863 
864 	if (likely(skb != NULL)) {
865 		bstats_update(&cl->bstats, skb);
866 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
867 		if (cl->un.leaf.deficit[level] < 0) {
868 			cl->un.leaf.deficit[level] += cl->quantum;
869 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
870 						 &q->hlevel[0].hprio[prio].ptr);
871 		}
872 		/* this used to be after charge_class but this constelation
873 		 * gives us slightly better performance
874 		 */
875 		if (!cl->un.leaf.q->q.qlen)
876 			htb_deactivate(q, cl);
877 		htb_charge_class(q, cl, level, skb);
878 	}
879 	return skb;
880 }
881 
882 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
883 {
884 	struct sk_buff *skb;
885 	struct htb_sched *q = qdisc_priv(sch);
886 	int level;
887 	s64 next_event;
888 	unsigned long start_at;
889 
890 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
891 	skb = __skb_dequeue(&q->direct_queue);
892 	if (skb != NULL) {
893 ok:
894 		qdisc_bstats_update(sch, skb);
895 		qdisc_qstats_backlog_dec(sch, skb);
896 		sch->q.qlen--;
897 		return skb;
898 	}
899 
900 	if (!sch->q.qlen)
901 		goto fin;
902 	q->now = ktime_get_ns();
903 	start_at = jiffies;
904 
905 	next_event = q->now + 5LLU * NSEC_PER_SEC;
906 
907 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
908 		/* common case optimization - skip event handler quickly */
909 		int m;
910 		s64 event = q->near_ev_cache[level];
911 
912 		if (q->now >= event) {
913 			event = htb_do_events(q, level, start_at);
914 			if (!event)
915 				event = q->now + NSEC_PER_SEC;
916 			q->near_ev_cache[level] = event;
917 		}
918 
919 		if (next_event > event)
920 			next_event = event;
921 
922 		m = ~q->row_mask[level];
923 		while (m != (int)(-1)) {
924 			int prio = ffz(m);
925 
926 			m |= 1 << prio;
927 			skb = htb_dequeue_tree(q, prio, level);
928 			if (likely(skb != NULL))
929 				goto ok;
930 		}
931 	}
932 	qdisc_qstats_overlimit(sch);
933 	if (likely(next_event > q->now))
934 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
935 	else
936 		schedule_work(&q->work);
937 fin:
938 	return skb;
939 }
940 
941 /* reset all classes */
942 /* always caled under BH & queue lock */
943 static void htb_reset(struct Qdisc *sch)
944 {
945 	struct htb_sched *q = qdisc_priv(sch);
946 	struct htb_class *cl;
947 	unsigned int i;
948 
949 	for (i = 0; i < q->clhash.hashsize; i++) {
950 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
951 			if (cl->level)
952 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
953 			else {
954 				if (cl->un.leaf.q)
955 					qdisc_reset(cl->un.leaf.q);
956 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
957 			}
958 			cl->prio_activity = 0;
959 			cl->cmode = HTB_CAN_SEND;
960 		}
961 	}
962 	qdisc_watchdog_cancel(&q->watchdog);
963 	__qdisc_reset_queue(&q->direct_queue);
964 	sch->q.qlen = 0;
965 	sch->qstats.backlog = 0;
966 	memset(q->hlevel, 0, sizeof(q->hlevel));
967 	memset(q->row_mask, 0, sizeof(q->row_mask));
968 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
969 		INIT_LIST_HEAD(q->drops + i);
970 }
971 
972 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
973 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
974 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
975 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
976 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
977 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
978 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
979 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
980 };
981 
982 static void htb_work_func(struct work_struct *work)
983 {
984 	struct htb_sched *q = container_of(work, struct htb_sched, work);
985 	struct Qdisc *sch = q->watchdog.qdisc;
986 
987 	rcu_read_lock();
988 	__netif_schedule(qdisc_root(sch));
989 	rcu_read_unlock();
990 }
991 
992 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
993 {
994 	struct htb_sched *q = qdisc_priv(sch);
995 	struct nlattr *tb[TCA_HTB_MAX + 1];
996 	struct tc_htb_glob *gopt;
997 	int err;
998 	int i;
999 
1000 	if (!opt)
1001 		return -EINVAL;
1002 
1003 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1004 	if (err < 0)
1005 		return err;
1006 
1007 	if (!tb[TCA_HTB_INIT])
1008 		return -EINVAL;
1009 
1010 	gopt = nla_data(tb[TCA_HTB_INIT]);
1011 	if (gopt->version != HTB_VER >> 16)
1012 		return -EINVAL;
1013 
1014 	err = qdisc_class_hash_init(&q->clhash);
1015 	if (err < 0)
1016 		return err;
1017 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1018 		INIT_LIST_HEAD(q->drops + i);
1019 
1020 	qdisc_watchdog_init(&q->watchdog, sch);
1021 	INIT_WORK(&q->work, htb_work_func);
1022 	__skb_queue_head_init(&q->direct_queue);
1023 
1024 	if (tb[TCA_HTB_DIRECT_QLEN])
1025 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1026 	else
1027 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1028 
1029 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1030 		q->rate2quantum = 1;
1031 	q->defcls = gopt->defcls;
1032 
1033 	return 0;
1034 }
1035 
1036 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1037 {
1038 	struct htb_sched *q = qdisc_priv(sch);
1039 	struct nlattr *nest;
1040 	struct tc_htb_glob gopt;
1041 
1042 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1043 	 * no change can happen on the qdisc parameters.
1044 	 */
1045 
1046 	gopt.direct_pkts = q->direct_pkts;
1047 	gopt.version = HTB_VER;
1048 	gopt.rate2quantum = q->rate2quantum;
1049 	gopt.defcls = q->defcls;
1050 	gopt.debug = 0;
1051 
1052 	nest = nla_nest_start(skb, TCA_OPTIONS);
1053 	if (nest == NULL)
1054 		goto nla_put_failure;
1055 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1056 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1057 		goto nla_put_failure;
1058 
1059 	return nla_nest_end(skb, nest);
1060 
1061 nla_put_failure:
1062 	nla_nest_cancel(skb, nest);
1063 	return -1;
1064 }
1065 
1066 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1067 			  struct sk_buff *skb, struct tcmsg *tcm)
1068 {
1069 	struct htb_class *cl = (struct htb_class *)arg;
1070 	struct nlattr *nest;
1071 	struct tc_htb_opt opt;
1072 
1073 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1074 	 * no change can happen on the class parameters.
1075 	 */
1076 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1077 	tcm->tcm_handle = cl->common.classid;
1078 	if (!cl->level && cl->un.leaf.q)
1079 		tcm->tcm_info = cl->un.leaf.q->handle;
1080 
1081 	nest = nla_nest_start(skb, TCA_OPTIONS);
1082 	if (nest == NULL)
1083 		goto nla_put_failure;
1084 
1085 	memset(&opt, 0, sizeof(opt));
1086 
1087 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1088 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1089 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1090 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1091 	opt.quantum = cl->quantum;
1092 	opt.prio = cl->prio;
1093 	opt.level = cl->level;
1094 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1095 		goto nla_put_failure;
1096 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1097 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1098 			      TCA_HTB_PAD))
1099 		goto nla_put_failure;
1100 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1101 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1102 			      TCA_HTB_PAD))
1103 		goto nla_put_failure;
1104 
1105 	return nla_nest_end(skb, nest);
1106 
1107 nla_put_failure:
1108 	nla_nest_cancel(skb, nest);
1109 	return -1;
1110 }
1111 
1112 static int
1113 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1114 {
1115 	struct htb_class *cl = (struct htb_class *)arg;
1116 	struct gnet_stats_queue qs = {
1117 		.drops = cl->drops,
1118 	};
1119 	__u32 qlen = 0;
1120 
1121 	if (!cl->level && cl->un.leaf.q) {
1122 		qlen = cl->un.leaf.q->q.qlen;
1123 		qs.backlog = cl->un.leaf.q->qstats.backlog;
1124 	}
1125 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1126 				    INT_MIN, INT_MAX);
1127 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1128 				     INT_MIN, INT_MAX);
1129 
1130 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1131 				  d, NULL, &cl->bstats) < 0 ||
1132 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1133 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1134 		return -1;
1135 
1136 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1137 }
1138 
1139 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1140 		     struct Qdisc **old)
1141 {
1142 	struct htb_class *cl = (struct htb_class *)arg;
1143 
1144 	if (cl->level)
1145 		return -EINVAL;
1146 	if (new == NULL &&
1147 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1148 				     cl->common.classid)) == NULL)
1149 		return -ENOBUFS;
1150 
1151 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
1152 	return 0;
1153 }
1154 
1155 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1156 {
1157 	struct htb_class *cl = (struct htb_class *)arg;
1158 	return !cl->level ? cl->un.leaf.q : NULL;
1159 }
1160 
1161 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1162 {
1163 	struct htb_class *cl = (struct htb_class *)arg;
1164 
1165 	if (cl->un.leaf.q->q.qlen == 0)
1166 		htb_deactivate(qdisc_priv(sch), cl);
1167 }
1168 
1169 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1170 {
1171 	struct htb_class *cl = htb_find(classid, sch);
1172 	if (cl)
1173 		cl->refcnt++;
1174 	return (unsigned long)cl;
1175 }
1176 
1177 static inline int htb_parent_last_child(struct htb_class *cl)
1178 {
1179 	if (!cl->parent)
1180 		/* the root class */
1181 		return 0;
1182 	if (cl->parent->children > 1)
1183 		/* not the last child */
1184 		return 0;
1185 	return 1;
1186 }
1187 
1188 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1189 			       struct Qdisc *new_q)
1190 {
1191 	struct htb_class *parent = cl->parent;
1192 
1193 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1194 
1195 	if (parent->cmode != HTB_CAN_SEND)
1196 		htb_safe_rb_erase(&parent->pq_node,
1197 				  &q->hlevel[parent->level].wait_pq);
1198 
1199 	parent->level = 0;
1200 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1201 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1202 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1203 	parent->tokens = parent->buffer;
1204 	parent->ctokens = parent->cbuffer;
1205 	parent->t_c = ktime_get_ns();
1206 	parent->cmode = HTB_CAN_SEND;
1207 }
1208 
1209 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1210 {
1211 	if (!cl->level) {
1212 		WARN_ON(!cl->un.leaf.q);
1213 		qdisc_destroy(cl->un.leaf.q);
1214 	}
1215 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1216 	tcf_destroy_chain(&cl->filter_list);
1217 	kfree(cl);
1218 }
1219 
1220 static void htb_destroy(struct Qdisc *sch)
1221 {
1222 	struct htb_sched *q = qdisc_priv(sch);
1223 	struct hlist_node *next;
1224 	struct htb_class *cl;
1225 	unsigned int i;
1226 
1227 	cancel_work_sync(&q->work);
1228 	qdisc_watchdog_cancel(&q->watchdog);
1229 	/* This line used to be after htb_destroy_class call below
1230 	 * and surprisingly it worked in 2.4. But it must precede it
1231 	 * because filter need its target class alive to be able to call
1232 	 * unbind_filter on it (without Oops).
1233 	 */
1234 	tcf_destroy_chain(&q->filter_list);
1235 
1236 	for (i = 0; i < q->clhash.hashsize; i++) {
1237 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
1238 			tcf_destroy_chain(&cl->filter_list);
1239 	}
1240 	for (i = 0; i < q->clhash.hashsize; i++) {
1241 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1242 					  common.hnode)
1243 			htb_destroy_class(sch, cl);
1244 	}
1245 	qdisc_class_hash_destroy(&q->clhash);
1246 	__qdisc_reset_queue(&q->direct_queue);
1247 }
1248 
1249 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1250 {
1251 	struct htb_sched *q = qdisc_priv(sch);
1252 	struct htb_class *cl = (struct htb_class *)arg;
1253 	struct Qdisc *new_q = NULL;
1254 	int last_child = 0;
1255 
1256 	/* TODO: why don't allow to delete subtree ? references ? does
1257 	 * tc subsys guarantee us that in htb_destroy it holds no class
1258 	 * refs so that we can remove children safely there ?
1259 	 */
1260 	if (cl->children || cl->filter_cnt)
1261 		return -EBUSY;
1262 
1263 	if (!cl->level && htb_parent_last_child(cl)) {
1264 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1265 					  cl->parent->common.classid);
1266 		last_child = 1;
1267 	}
1268 
1269 	sch_tree_lock(sch);
1270 
1271 	if (!cl->level) {
1272 		unsigned int qlen = cl->un.leaf.q->q.qlen;
1273 		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1274 
1275 		qdisc_reset(cl->un.leaf.q);
1276 		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1277 	}
1278 
1279 	/* delete from hash and active; remainder in destroy_class */
1280 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1281 	if (cl->parent)
1282 		cl->parent->children--;
1283 
1284 	if (cl->prio_activity)
1285 		htb_deactivate(q, cl);
1286 
1287 	if (cl->cmode != HTB_CAN_SEND)
1288 		htb_safe_rb_erase(&cl->pq_node,
1289 				  &q->hlevel[cl->level].wait_pq);
1290 
1291 	if (last_child)
1292 		htb_parent_to_leaf(q, cl, new_q);
1293 
1294 	BUG_ON(--cl->refcnt == 0);
1295 	/*
1296 	 * This shouldn't happen: we "hold" one cops->get() when called
1297 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1298 	 */
1299 
1300 	sch_tree_unlock(sch);
1301 	return 0;
1302 }
1303 
1304 static void htb_put(struct Qdisc *sch, unsigned long arg)
1305 {
1306 	struct htb_class *cl = (struct htb_class *)arg;
1307 
1308 	if (--cl->refcnt == 0)
1309 		htb_destroy_class(sch, cl);
1310 }
1311 
1312 static int htb_change_class(struct Qdisc *sch, u32 classid,
1313 			    u32 parentid, struct nlattr **tca,
1314 			    unsigned long *arg)
1315 {
1316 	int err = -EINVAL;
1317 	struct htb_sched *q = qdisc_priv(sch);
1318 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1319 	struct nlattr *opt = tca[TCA_OPTIONS];
1320 	struct nlattr *tb[TCA_HTB_MAX + 1];
1321 	struct tc_htb_opt *hopt;
1322 	u64 rate64, ceil64;
1323 
1324 	/* extract all subattrs from opt attr */
1325 	if (!opt)
1326 		goto failure;
1327 
1328 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1329 	if (err < 0)
1330 		goto failure;
1331 
1332 	err = -EINVAL;
1333 	if (tb[TCA_HTB_PARMS] == NULL)
1334 		goto failure;
1335 
1336 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1337 
1338 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1339 	if (!hopt->rate.rate || !hopt->ceil.rate)
1340 		goto failure;
1341 
1342 	/* Keeping backward compatible with rate_table based iproute2 tc */
1343 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1344 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1345 
1346 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1347 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1348 
1349 	if (!cl) {		/* new class */
1350 		struct Qdisc *new_q;
1351 		int prio;
1352 		struct {
1353 			struct nlattr		nla;
1354 			struct gnet_estimator	opt;
1355 		} est = {
1356 			.nla = {
1357 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1358 				.nla_type	= TCA_RATE,
1359 			},
1360 			.opt = {
1361 				/* 4s interval, 16s averaging constant */
1362 				.interval	= 2,
1363 				.ewma_log	= 2,
1364 			},
1365 		};
1366 
1367 		/* check for valid classid */
1368 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1369 		    htb_find(classid, sch))
1370 			goto failure;
1371 
1372 		/* check maximal depth */
1373 		if (parent && parent->parent && parent->parent->level < 2) {
1374 			pr_err("htb: tree is too deep\n");
1375 			goto failure;
1376 		}
1377 		err = -ENOBUFS;
1378 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1379 		if (!cl)
1380 			goto failure;
1381 
1382 		if (htb_rate_est || tca[TCA_RATE]) {
1383 			err = gen_new_estimator(&cl->bstats, NULL,
1384 						&cl->rate_est,
1385 						NULL,
1386 						qdisc_root_sleeping_running(sch),
1387 						tca[TCA_RATE] ? : &est.nla);
1388 			if (err) {
1389 				kfree(cl);
1390 				goto failure;
1391 			}
1392 		}
1393 
1394 		cl->refcnt = 1;
1395 		cl->children = 0;
1396 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1397 		RB_CLEAR_NODE(&cl->pq_node);
1398 
1399 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1400 			RB_CLEAR_NODE(&cl->node[prio]);
1401 
1402 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1403 		 * so that can't be used inside of sch_tree_lock
1404 		 * -- thanks to Karlis Peisenieks
1405 		 */
1406 		new_q = qdisc_create_dflt(sch->dev_queue,
1407 					  &pfifo_qdisc_ops, classid);
1408 		sch_tree_lock(sch);
1409 		if (parent && !parent->level) {
1410 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1411 			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1412 
1413 			/* turn parent into inner node */
1414 			qdisc_reset(parent->un.leaf.q);
1415 			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1416 			qdisc_destroy(parent->un.leaf.q);
1417 			if (parent->prio_activity)
1418 				htb_deactivate(q, parent);
1419 
1420 			/* remove from evt list because of level change */
1421 			if (parent->cmode != HTB_CAN_SEND) {
1422 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1423 				parent->cmode = HTB_CAN_SEND;
1424 			}
1425 			parent->level = (parent->parent ? parent->parent->level
1426 					 : TC_HTB_MAXDEPTH) - 1;
1427 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1428 		}
1429 		/* leaf (we) needs elementary qdisc */
1430 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1431 
1432 		cl->common.classid = classid;
1433 		cl->parent = parent;
1434 
1435 		/* set class to be in HTB_CAN_SEND state */
1436 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1437 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1438 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1439 		cl->t_c = ktime_get_ns();
1440 		cl->cmode = HTB_CAN_SEND;
1441 
1442 		/* attach to the hash list and parent's family */
1443 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1444 		if (parent)
1445 			parent->children++;
1446 	} else {
1447 		if (tca[TCA_RATE]) {
1448 			err = gen_replace_estimator(&cl->bstats, NULL,
1449 						    &cl->rate_est,
1450 						    NULL,
1451 						    qdisc_root_sleeping_running(sch),
1452 						    tca[TCA_RATE]);
1453 			if (err)
1454 				return err;
1455 		}
1456 		sch_tree_lock(sch);
1457 	}
1458 
1459 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1460 
1461 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1462 
1463 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1464 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1465 
1466 	/* it used to be a nasty bug here, we have to check that node
1467 	 * is really leaf before changing cl->un.leaf !
1468 	 */
1469 	if (!cl->level) {
1470 		u64 quantum = cl->rate.rate_bytes_ps;
1471 
1472 		do_div(quantum, q->rate2quantum);
1473 		cl->quantum = min_t(u64, quantum, INT_MAX);
1474 
1475 		if (!hopt->quantum && cl->quantum < 1000) {
1476 			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1477 				cl->common.classid);
1478 			cl->quantum = 1000;
1479 		}
1480 		if (!hopt->quantum && cl->quantum > 200000) {
1481 			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1482 				cl->common.classid);
1483 			cl->quantum = 200000;
1484 		}
1485 		if (hopt->quantum)
1486 			cl->quantum = hopt->quantum;
1487 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1488 			cl->prio = TC_HTB_NUMPRIO - 1;
1489 	}
1490 
1491 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1492 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1493 
1494 	sch_tree_unlock(sch);
1495 
1496 	qdisc_class_hash_grow(sch, &q->clhash);
1497 
1498 	*arg = (unsigned long)cl;
1499 	return 0;
1500 
1501 failure:
1502 	return err;
1503 }
1504 
1505 static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch,
1506 					     unsigned long arg)
1507 {
1508 	struct htb_sched *q = qdisc_priv(sch);
1509 	struct htb_class *cl = (struct htb_class *)arg;
1510 	struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list;
1511 
1512 	return fl;
1513 }
1514 
1515 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1516 				     u32 classid)
1517 {
1518 	struct htb_class *cl = htb_find(classid, sch);
1519 
1520 	/*if (cl && !cl->level) return 0;
1521 	 * The line above used to be there to prevent attaching filters to
1522 	 * leaves. But at least tc_index filter uses this just to get class
1523 	 * for other reasons so that we have to allow for it.
1524 	 * ----
1525 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1526 	 * another way to "lock" the class - unlike "get" this lock can
1527 	 * be broken by class during destroy IIUC.
1528 	 */
1529 	if (cl)
1530 		cl->filter_cnt++;
1531 	return (unsigned long)cl;
1532 }
1533 
1534 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1535 {
1536 	struct htb_class *cl = (struct htb_class *)arg;
1537 
1538 	if (cl)
1539 		cl->filter_cnt--;
1540 }
1541 
1542 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1543 {
1544 	struct htb_sched *q = qdisc_priv(sch);
1545 	struct htb_class *cl;
1546 	unsigned int i;
1547 
1548 	if (arg->stop)
1549 		return;
1550 
1551 	for (i = 0; i < q->clhash.hashsize; i++) {
1552 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1553 			if (arg->count < arg->skip) {
1554 				arg->count++;
1555 				continue;
1556 			}
1557 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1558 				arg->stop = 1;
1559 				return;
1560 			}
1561 			arg->count++;
1562 		}
1563 	}
1564 }
1565 
1566 static const struct Qdisc_class_ops htb_class_ops = {
1567 	.graft		=	htb_graft,
1568 	.leaf		=	htb_leaf,
1569 	.qlen_notify	=	htb_qlen_notify,
1570 	.get		=	htb_get,
1571 	.put		=	htb_put,
1572 	.change		=	htb_change_class,
1573 	.delete		=	htb_delete,
1574 	.walk		=	htb_walk,
1575 	.tcf_chain	=	htb_find_tcf,
1576 	.bind_tcf	=	htb_bind_filter,
1577 	.unbind_tcf	=	htb_unbind_filter,
1578 	.dump		=	htb_dump_class,
1579 	.dump_stats	=	htb_dump_class_stats,
1580 };
1581 
1582 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1583 	.cl_ops		=	&htb_class_ops,
1584 	.id		=	"htb",
1585 	.priv_size	=	sizeof(struct htb_sched),
1586 	.enqueue	=	htb_enqueue,
1587 	.dequeue	=	htb_dequeue,
1588 	.peek		=	qdisc_peek_dequeued,
1589 	.init		=	htb_init,
1590 	.reset		=	htb_reset,
1591 	.destroy	=	htb_destroy,
1592 	.dump		=	htb_dump,
1593 	.owner		=	THIS_MODULE,
1594 };
1595 
1596 static int __init htb_module_init(void)
1597 {
1598 	return register_qdisc(&htb_qdisc_ops);
1599 }
1600 static void __exit htb_module_exit(void)
1601 {
1602 	unregister_qdisc(&htb_qdisc_ops);
1603 }
1604 
1605 module_init(htb_module_init)
1606 module_exit(htb_module_exit)
1607 MODULE_LICENSE("GPL");
1608