xref: /openbmc/linux/net/sched/sch_htb.c (revision 4a3fad70)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 #include <net/pkt_cls.h>
44 
45 /* HTB algorithm.
46     Author: devik@cdi.cz
47     ========================================================================
48     HTB is like TBF with multiple classes. It is also similar to CBQ because
49     it allows to assign priority to each class in hierarchy.
50     In fact it is another implementation of Floyd's formal sharing.
51 
52     Levels:
53     Each class is assigned level. Leaf has ALWAYS level 0 and root
54     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
55     one less than their parent.
56 */
57 
58 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
59 #define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
60 
61 #if HTB_VER >> 16 != TC_HTB_PROTOVER
62 #error "Mismatched sch_htb.c and pkt_sch.h"
63 #endif
64 
65 /* Module parameter and sysfs export */
66 module_param    (htb_hysteresis, int, 0640);
67 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
68 
69 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
70 module_param(htb_rate_est, int, 0640);
71 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
72 
73 /* used internaly to keep status of single class */
74 enum htb_cmode {
75 	HTB_CANT_SEND,		/* class can't send and can't borrow */
76 	HTB_MAY_BORROW,		/* class can't send but may borrow */
77 	HTB_CAN_SEND		/* class can send */
78 };
79 
80 struct htb_prio {
81 	union {
82 		struct rb_root	row;
83 		struct rb_root	feed;
84 	};
85 	struct rb_node	*ptr;
86 	/* When class changes from state 1->2 and disconnects from
87 	 * parent's feed then we lost ptr value and start from the
88 	 * first child again. Here we store classid of the
89 	 * last valid ptr (used when ptr is NULL).
90 	 */
91 	u32		last_ptr_id;
92 };
93 
94 /* interior & leaf nodes; props specific to leaves are marked L:
95  * To reduce false sharing, place mostly read fields at beginning,
96  * and mostly written ones at the end.
97  */
98 struct htb_class {
99 	struct Qdisc_class_common common;
100 	struct psched_ratecfg	rate;
101 	struct psched_ratecfg	ceil;
102 	s64			buffer, cbuffer;/* token bucket depth/rate */
103 	s64			mbuffer;	/* max wait time */
104 	u32			prio;		/* these two are used only by leaves... */
105 	int			quantum;	/* but stored for parent-to-leaf return */
106 
107 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
108 	struct tcf_block	*block;
109 	int			filter_cnt;
110 
111 	int			level;		/* our level (see above) */
112 	unsigned int		children;
113 	struct htb_class	*parent;	/* parent class */
114 
115 	struct net_rate_estimator __rcu *rate_est;
116 
117 	/*
118 	 * Written often fields
119 	 */
120 	struct gnet_stats_basic_packed bstats;
121 	struct tc_htb_xstats	xstats;	/* our special stats */
122 
123 	/* token bucket parameters */
124 	s64			tokens, ctokens;/* current number of tokens */
125 	s64			t_c;		/* checkpoint time */
126 
127 	union {
128 		struct htb_class_leaf {
129 			struct list_head drop_list;
130 			int		deficit[TC_HTB_MAXDEPTH];
131 			struct Qdisc	*q;
132 		} leaf;
133 		struct htb_class_inner {
134 			struct htb_prio clprio[TC_HTB_NUMPRIO];
135 		} inner;
136 	} un;
137 	s64			pq_key;
138 
139 	int			prio_activity;	/* for which prios are we active */
140 	enum htb_cmode		cmode;		/* current mode of the class */
141 	struct rb_node		pq_node;	/* node for event queue */
142 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
143 
144 	unsigned int drops ____cacheline_aligned_in_smp;
145 	unsigned int		overlimits;
146 };
147 
148 struct htb_level {
149 	struct rb_root	wait_pq;
150 	struct htb_prio hprio[TC_HTB_NUMPRIO];
151 };
152 
153 struct htb_sched {
154 	struct Qdisc_class_hash clhash;
155 	int			defcls;		/* class where unclassified flows go to */
156 	int			rate2quantum;	/* quant = rate / rate2quantum */
157 
158 	/* filters for qdisc itself */
159 	struct tcf_proto __rcu	*filter_list;
160 	struct tcf_block	*block;
161 
162 #define HTB_WARN_TOOMANYEVENTS	0x1
163 	unsigned int		warned;	/* only one warning */
164 	int			direct_qlen;
165 	struct work_struct	work;
166 
167 	/* non shaped skbs; let them go directly thru */
168 	struct qdisc_skb_head	direct_queue;
169 	long			direct_pkts;
170 
171 	struct qdisc_watchdog	watchdog;
172 
173 	s64			now;	/* cached dequeue time */
174 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
175 
176 	/* time of nearest event per level (row) */
177 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
178 
179 	int			row_mask[TC_HTB_MAXDEPTH];
180 
181 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
182 };
183 
184 /* find class in global hash table using given handle */
185 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
186 {
187 	struct htb_sched *q = qdisc_priv(sch);
188 	struct Qdisc_class_common *clc;
189 
190 	clc = qdisc_class_find(&q->clhash, handle);
191 	if (clc == NULL)
192 		return NULL;
193 	return container_of(clc, struct htb_class, common);
194 }
195 
196 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
197 {
198 	return (unsigned long)htb_find(handle, sch);
199 }
200 /**
201  * htb_classify - classify a packet into class
202  *
203  * It returns NULL if the packet should be dropped or -1 if the packet
204  * should be passed directly thru. In all other cases leaf class is returned.
205  * We allow direct class selection by classid in priority. The we examine
206  * filters in qdisc and in inner nodes (if higher filter points to the inner
207  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
208  * internal fifo (direct). These packets then go directly thru. If we still
209  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
210  * then finish and return direct queue.
211  */
212 #define HTB_DIRECT ((struct htb_class *)-1L)
213 
214 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
215 				      int *qerr)
216 {
217 	struct htb_sched *q = qdisc_priv(sch);
218 	struct htb_class *cl;
219 	struct tcf_result res;
220 	struct tcf_proto *tcf;
221 	int result;
222 
223 	/* allow to select class by setting skb->priority to valid classid;
224 	 * note that nfmark can be used too by attaching filter fw with no
225 	 * rules in it
226 	 */
227 	if (skb->priority == sch->handle)
228 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
229 	cl = htb_find(skb->priority, sch);
230 	if (cl) {
231 		if (cl->level == 0)
232 			return cl;
233 		/* Start with inner filter chain if a non-leaf class is selected */
234 		tcf = rcu_dereference_bh(cl->filter_list);
235 	} else {
236 		tcf = rcu_dereference_bh(q->filter_list);
237 	}
238 
239 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
240 	while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
241 #ifdef CONFIG_NET_CLS_ACT
242 		switch (result) {
243 		case TC_ACT_QUEUED:
244 		case TC_ACT_STOLEN:
245 		case TC_ACT_TRAP:
246 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
247 			/* fall through */
248 		case TC_ACT_SHOT:
249 			return NULL;
250 		}
251 #endif
252 		cl = (void *)res.class;
253 		if (!cl) {
254 			if (res.classid == sch->handle)
255 				return HTB_DIRECT;	/* X:0 (direct flow) */
256 			cl = htb_find(res.classid, sch);
257 			if (!cl)
258 				break;	/* filter selected invalid classid */
259 		}
260 		if (!cl->level)
261 			return cl;	/* we hit leaf; return it */
262 
263 		/* we have got inner class; apply inner filter chain */
264 		tcf = rcu_dereference_bh(cl->filter_list);
265 	}
266 	/* classification failed; try to use default class */
267 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
268 	if (!cl || cl->level)
269 		return HTB_DIRECT;	/* bad default .. this is safe bet */
270 	return cl;
271 }
272 
273 /**
274  * htb_add_to_id_tree - adds class to the round robin list
275  *
276  * Routine adds class to the list (actually tree) sorted by classid.
277  * Make sure that class is not already on such list for given prio.
278  */
279 static void htb_add_to_id_tree(struct rb_root *root,
280 			       struct htb_class *cl, int prio)
281 {
282 	struct rb_node **p = &root->rb_node, *parent = NULL;
283 
284 	while (*p) {
285 		struct htb_class *c;
286 		parent = *p;
287 		c = rb_entry(parent, struct htb_class, node[prio]);
288 
289 		if (cl->common.classid > c->common.classid)
290 			p = &parent->rb_right;
291 		else
292 			p = &parent->rb_left;
293 	}
294 	rb_link_node(&cl->node[prio], parent, p);
295 	rb_insert_color(&cl->node[prio], root);
296 }
297 
298 /**
299  * htb_add_to_wait_tree - adds class to the event queue with delay
300  *
301  * The class is added to priority event queue to indicate that class will
302  * change its mode in cl->pq_key microseconds. Make sure that class is not
303  * already in the queue.
304  */
305 static void htb_add_to_wait_tree(struct htb_sched *q,
306 				 struct htb_class *cl, s64 delay)
307 {
308 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
309 
310 	cl->pq_key = q->now + delay;
311 	if (cl->pq_key == q->now)
312 		cl->pq_key++;
313 
314 	/* update the nearest event cache */
315 	if (q->near_ev_cache[cl->level] > cl->pq_key)
316 		q->near_ev_cache[cl->level] = cl->pq_key;
317 
318 	while (*p) {
319 		struct htb_class *c;
320 		parent = *p;
321 		c = rb_entry(parent, struct htb_class, pq_node);
322 		if (cl->pq_key >= c->pq_key)
323 			p = &parent->rb_right;
324 		else
325 			p = &parent->rb_left;
326 	}
327 	rb_link_node(&cl->pq_node, parent, p);
328 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
329 }
330 
331 /**
332  * htb_next_rb_node - finds next node in binary tree
333  *
334  * When we are past last key we return NULL.
335  * Average complexity is 2 steps per call.
336  */
337 static inline void htb_next_rb_node(struct rb_node **n)
338 {
339 	*n = rb_next(*n);
340 }
341 
342 /**
343  * htb_add_class_to_row - add class to its row
344  *
345  * The class is added to row at priorities marked in mask.
346  * It does nothing if mask == 0.
347  */
348 static inline void htb_add_class_to_row(struct htb_sched *q,
349 					struct htb_class *cl, int mask)
350 {
351 	q->row_mask[cl->level] |= mask;
352 	while (mask) {
353 		int prio = ffz(~mask);
354 		mask &= ~(1 << prio);
355 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
356 	}
357 }
358 
359 /* If this triggers, it is a bug in this code, but it need not be fatal */
360 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
361 {
362 	if (RB_EMPTY_NODE(rb)) {
363 		WARN_ON(1);
364 	} else {
365 		rb_erase(rb, root);
366 		RB_CLEAR_NODE(rb);
367 	}
368 }
369 
370 
371 /**
372  * htb_remove_class_from_row - removes class from its row
373  *
374  * The class is removed from row at priorities marked in mask.
375  * It does nothing if mask == 0.
376  */
377 static inline void htb_remove_class_from_row(struct htb_sched *q,
378 						 struct htb_class *cl, int mask)
379 {
380 	int m = 0;
381 	struct htb_level *hlevel = &q->hlevel[cl->level];
382 
383 	while (mask) {
384 		int prio = ffz(~mask);
385 		struct htb_prio *hprio = &hlevel->hprio[prio];
386 
387 		mask &= ~(1 << prio);
388 		if (hprio->ptr == cl->node + prio)
389 			htb_next_rb_node(&hprio->ptr);
390 
391 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
392 		if (!hprio->row.rb_node)
393 			m |= 1 << prio;
394 	}
395 	q->row_mask[cl->level] &= ~m;
396 }
397 
398 /**
399  * htb_activate_prios - creates active classe's feed chain
400  *
401  * The class is connected to ancestors and/or appropriate rows
402  * for priorities it is participating on. cl->cmode must be new
403  * (activated) mode. It does nothing if cl->prio_activity == 0.
404  */
405 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
406 {
407 	struct htb_class *p = cl->parent;
408 	long m, mask = cl->prio_activity;
409 
410 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
411 		m = mask;
412 		while (m) {
413 			int prio = ffz(~m);
414 			m &= ~(1 << prio);
415 
416 			if (p->un.inner.clprio[prio].feed.rb_node)
417 				/* parent already has its feed in use so that
418 				 * reset bit in mask as parent is already ok
419 				 */
420 				mask &= ~(1 << prio);
421 
422 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
423 		}
424 		p->prio_activity |= mask;
425 		cl = p;
426 		p = cl->parent;
427 
428 	}
429 	if (cl->cmode == HTB_CAN_SEND && mask)
430 		htb_add_class_to_row(q, cl, mask);
431 }
432 
433 /**
434  * htb_deactivate_prios - remove class from feed chain
435  *
436  * cl->cmode must represent old mode (before deactivation). It does
437  * nothing if cl->prio_activity == 0. Class is removed from all feed
438  * chains and rows.
439  */
440 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
441 {
442 	struct htb_class *p = cl->parent;
443 	long m, mask = cl->prio_activity;
444 
445 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
446 		m = mask;
447 		mask = 0;
448 		while (m) {
449 			int prio = ffz(~m);
450 			m &= ~(1 << prio);
451 
452 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
453 				/* we are removing child which is pointed to from
454 				 * parent feed - forget the pointer but remember
455 				 * classid
456 				 */
457 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
458 				p->un.inner.clprio[prio].ptr = NULL;
459 			}
460 
461 			htb_safe_rb_erase(cl->node + prio,
462 					  &p->un.inner.clprio[prio].feed);
463 
464 			if (!p->un.inner.clprio[prio].feed.rb_node)
465 				mask |= 1 << prio;
466 		}
467 
468 		p->prio_activity &= ~mask;
469 		cl = p;
470 		p = cl->parent;
471 
472 	}
473 	if (cl->cmode == HTB_CAN_SEND && mask)
474 		htb_remove_class_from_row(q, cl, mask);
475 }
476 
477 static inline s64 htb_lowater(const struct htb_class *cl)
478 {
479 	if (htb_hysteresis)
480 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
481 	else
482 		return 0;
483 }
484 static inline s64 htb_hiwater(const struct htb_class *cl)
485 {
486 	if (htb_hysteresis)
487 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
488 	else
489 		return 0;
490 }
491 
492 
493 /**
494  * htb_class_mode - computes and returns current class mode
495  *
496  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
497  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
498  * from now to time when cl will change its state.
499  * Also it is worth to note that class mode doesn't change simply
500  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
501  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
502  * mode transitions per time unit. The speed gain is about 1/6.
503  */
504 static inline enum htb_cmode
505 htb_class_mode(struct htb_class *cl, s64 *diff)
506 {
507 	s64 toks;
508 
509 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
510 		*diff = -toks;
511 		return HTB_CANT_SEND;
512 	}
513 
514 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
515 		return HTB_CAN_SEND;
516 
517 	*diff = -toks;
518 	return HTB_MAY_BORROW;
519 }
520 
521 /**
522  * htb_change_class_mode - changes classe's mode
523  *
524  * This should be the only way how to change classe's mode under normal
525  * cirsumstances. Routine will update feed lists linkage, change mode
526  * and add class to the wait event queue if appropriate. New mode should
527  * be different from old one and cl->pq_key has to be valid if changing
528  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
529  */
530 static void
531 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
532 {
533 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
534 
535 	if (new_mode == cl->cmode)
536 		return;
537 
538 	if (new_mode == HTB_CANT_SEND)
539 		cl->overlimits++;
540 
541 	if (cl->prio_activity) {	/* not necessary: speed optimization */
542 		if (cl->cmode != HTB_CANT_SEND)
543 			htb_deactivate_prios(q, cl);
544 		cl->cmode = new_mode;
545 		if (new_mode != HTB_CANT_SEND)
546 			htb_activate_prios(q, cl);
547 	} else
548 		cl->cmode = new_mode;
549 }
550 
551 /**
552  * htb_activate - inserts leaf cl into appropriate active feeds
553  *
554  * Routine learns (new) priority of leaf and activates feed chain
555  * for the prio. It can be called on already active leaf safely.
556  * It also adds leaf into droplist.
557  */
558 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
559 {
560 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
561 
562 	if (!cl->prio_activity) {
563 		cl->prio_activity = 1 << cl->prio;
564 		htb_activate_prios(q, cl);
565 		list_add_tail(&cl->un.leaf.drop_list,
566 			      q->drops + cl->prio);
567 	}
568 }
569 
570 /**
571  * htb_deactivate - remove leaf cl from active feeds
572  *
573  * Make sure that leaf is active. In the other words it can't be called
574  * with non-active leaf. It also removes class from the drop list.
575  */
576 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
577 {
578 	WARN_ON(!cl->prio_activity);
579 
580 	htb_deactivate_prios(q, cl);
581 	cl->prio_activity = 0;
582 	list_del_init(&cl->un.leaf.drop_list);
583 }
584 
585 static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
586 			     struct qdisc_skb_head *qh)
587 {
588 	struct sk_buff *last = qh->tail;
589 
590 	if (last) {
591 		skb->next = NULL;
592 		last->next = skb;
593 		qh->tail = skb;
594 	} else {
595 		qh->tail = skb;
596 		qh->head = skb;
597 	}
598 	qh->qlen++;
599 }
600 
601 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
602 		       struct sk_buff **to_free)
603 {
604 	int uninitialized_var(ret);
605 	struct htb_sched *q = qdisc_priv(sch);
606 	struct htb_class *cl = htb_classify(skb, sch, &ret);
607 
608 	if (cl == HTB_DIRECT) {
609 		/* enqueue to helper queue */
610 		if (q->direct_queue.qlen < q->direct_qlen) {
611 			htb_enqueue_tail(skb, sch, &q->direct_queue);
612 			q->direct_pkts++;
613 		} else {
614 			return qdisc_drop(skb, sch, to_free);
615 		}
616 #ifdef CONFIG_NET_CLS_ACT
617 	} else if (!cl) {
618 		if (ret & __NET_XMIT_BYPASS)
619 			qdisc_qstats_drop(sch);
620 		__qdisc_drop(skb, to_free);
621 		return ret;
622 #endif
623 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
624 					to_free)) != NET_XMIT_SUCCESS) {
625 		if (net_xmit_drop_count(ret)) {
626 			qdisc_qstats_drop(sch);
627 			cl->drops++;
628 		}
629 		return ret;
630 	} else {
631 		htb_activate(q, cl);
632 	}
633 
634 	qdisc_qstats_backlog_inc(sch, skb);
635 	sch->q.qlen++;
636 	return NET_XMIT_SUCCESS;
637 }
638 
639 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
640 {
641 	s64 toks = diff + cl->tokens;
642 
643 	if (toks > cl->buffer)
644 		toks = cl->buffer;
645 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
646 	if (toks <= -cl->mbuffer)
647 		toks = 1 - cl->mbuffer;
648 
649 	cl->tokens = toks;
650 }
651 
652 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
653 {
654 	s64 toks = diff + cl->ctokens;
655 
656 	if (toks > cl->cbuffer)
657 		toks = cl->cbuffer;
658 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
659 	if (toks <= -cl->mbuffer)
660 		toks = 1 - cl->mbuffer;
661 
662 	cl->ctokens = toks;
663 }
664 
665 /**
666  * htb_charge_class - charges amount "bytes" to leaf and ancestors
667  *
668  * Routine assumes that packet "bytes" long was dequeued from leaf cl
669  * borrowing from "level". It accounts bytes to ceil leaky bucket for
670  * leaf and all ancestors and to rate bucket for ancestors at levels
671  * "level" and higher. It also handles possible change of mode resulting
672  * from the update. Note that mode can also increase here (MAY_BORROW to
673  * CAN_SEND) because we can use more precise clock that event queue here.
674  * In such case we remove class from event queue first.
675  */
676 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
677 			     int level, struct sk_buff *skb)
678 {
679 	int bytes = qdisc_pkt_len(skb);
680 	enum htb_cmode old_mode;
681 	s64 diff;
682 
683 	while (cl) {
684 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
685 		if (cl->level >= level) {
686 			if (cl->level == level)
687 				cl->xstats.lends++;
688 			htb_accnt_tokens(cl, bytes, diff);
689 		} else {
690 			cl->xstats.borrows++;
691 			cl->tokens += diff;	/* we moved t_c; update tokens */
692 		}
693 		htb_accnt_ctokens(cl, bytes, diff);
694 		cl->t_c = q->now;
695 
696 		old_mode = cl->cmode;
697 		diff = 0;
698 		htb_change_class_mode(q, cl, &diff);
699 		if (old_mode != cl->cmode) {
700 			if (old_mode != HTB_CAN_SEND)
701 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
702 			if (cl->cmode != HTB_CAN_SEND)
703 				htb_add_to_wait_tree(q, cl, diff);
704 		}
705 
706 		/* update basic stats except for leaves which are already updated */
707 		if (cl->level)
708 			bstats_update(&cl->bstats, skb);
709 
710 		cl = cl->parent;
711 	}
712 }
713 
714 /**
715  * htb_do_events - make mode changes to classes at the level
716  *
717  * Scans event queue for pending events and applies them. Returns time of
718  * next pending event (0 for no event in pq, q->now for too many events).
719  * Note: Applied are events whose have cl->pq_key <= q->now.
720  */
721 static s64 htb_do_events(struct htb_sched *q, const int level,
722 			 unsigned long start)
723 {
724 	/* don't run for longer than 2 jiffies; 2 is used instead of
725 	 * 1 to simplify things when jiffy is going to be incremented
726 	 * too soon
727 	 */
728 	unsigned long stop_at = start + 2;
729 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
730 
731 	while (time_before(jiffies, stop_at)) {
732 		struct htb_class *cl;
733 		s64 diff;
734 		struct rb_node *p = rb_first(wait_pq);
735 
736 		if (!p)
737 			return 0;
738 
739 		cl = rb_entry(p, struct htb_class, pq_node);
740 		if (cl->pq_key > q->now)
741 			return cl->pq_key;
742 
743 		htb_safe_rb_erase(p, wait_pq);
744 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
745 		htb_change_class_mode(q, cl, &diff);
746 		if (cl->cmode != HTB_CAN_SEND)
747 			htb_add_to_wait_tree(q, cl, diff);
748 	}
749 
750 	/* too much load - let's continue after a break for scheduling */
751 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
752 		pr_warn("htb: too many events!\n");
753 		q->warned |= HTB_WARN_TOOMANYEVENTS;
754 	}
755 
756 	return q->now;
757 }
758 
759 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
760  * is no such one exists.
761  */
762 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
763 					      u32 id)
764 {
765 	struct rb_node *r = NULL;
766 	while (n) {
767 		struct htb_class *cl =
768 		    rb_entry(n, struct htb_class, node[prio]);
769 
770 		if (id > cl->common.classid) {
771 			n = n->rb_right;
772 		} else if (id < cl->common.classid) {
773 			r = n;
774 			n = n->rb_left;
775 		} else {
776 			return n;
777 		}
778 	}
779 	return r;
780 }
781 
782 /**
783  * htb_lookup_leaf - returns next leaf class in DRR order
784  *
785  * Find leaf where current feed pointers points to.
786  */
787 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
788 {
789 	int i;
790 	struct {
791 		struct rb_node *root;
792 		struct rb_node **pptr;
793 		u32 *pid;
794 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
795 
796 	BUG_ON(!hprio->row.rb_node);
797 	sp->root = hprio->row.rb_node;
798 	sp->pptr = &hprio->ptr;
799 	sp->pid = &hprio->last_ptr_id;
800 
801 	for (i = 0; i < 65535; i++) {
802 		if (!*sp->pptr && *sp->pid) {
803 			/* ptr was invalidated but id is valid - try to recover
804 			 * the original or next ptr
805 			 */
806 			*sp->pptr =
807 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
808 		}
809 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
810 				 * can become out of date quickly
811 				 */
812 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
813 			*sp->pptr = sp->root;
814 			while ((*sp->pptr)->rb_left)
815 				*sp->pptr = (*sp->pptr)->rb_left;
816 			if (sp > stk) {
817 				sp--;
818 				if (!*sp->pptr) {
819 					WARN_ON(1);
820 					return NULL;
821 				}
822 				htb_next_rb_node(sp->pptr);
823 			}
824 		} else {
825 			struct htb_class *cl;
826 			struct htb_prio *clp;
827 
828 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
829 			if (!cl->level)
830 				return cl;
831 			clp = &cl->un.inner.clprio[prio];
832 			(++sp)->root = clp->feed.rb_node;
833 			sp->pptr = &clp->ptr;
834 			sp->pid = &clp->last_ptr_id;
835 		}
836 	}
837 	WARN_ON(1);
838 	return NULL;
839 }
840 
841 /* dequeues packet at given priority and level; call only if
842  * you are sure that there is active class at prio/level
843  */
844 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
845 					const int level)
846 {
847 	struct sk_buff *skb = NULL;
848 	struct htb_class *cl, *start;
849 	struct htb_level *hlevel = &q->hlevel[level];
850 	struct htb_prio *hprio = &hlevel->hprio[prio];
851 
852 	/* look initial class up in the row */
853 	start = cl = htb_lookup_leaf(hprio, prio);
854 
855 	do {
856 next:
857 		if (unlikely(!cl))
858 			return NULL;
859 
860 		/* class can be empty - it is unlikely but can be true if leaf
861 		 * qdisc drops packets in enqueue routine or if someone used
862 		 * graft operation on the leaf since last dequeue;
863 		 * simply deactivate and skip such class
864 		 */
865 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
866 			struct htb_class *next;
867 			htb_deactivate(q, cl);
868 
869 			/* row/level might become empty */
870 			if ((q->row_mask[level] & (1 << prio)) == 0)
871 				return NULL;
872 
873 			next = htb_lookup_leaf(hprio, prio);
874 
875 			if (cl == start)	/* fix start if we just deleted it */
876 				start = next;
877 			cl = next;
878 			goto next;
879 		}
880 
881 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
882 		if (likely(skb != NULL))
883 			break;
884 
885 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
886 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
887 					 &q->hlevel[0].hprio[prio].ptr);
888 		cl = htb_lookup_leaf(hprio, prio);
889 
890 	} while (cl != start);
891 
892 	if (likely(skb != NULL)) {
893 		bstats_update(&cl->bstats, skb);
894 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
895 		if (cl->un.leaf.deficit[level] < 0) {
896 			cl->un.leaf.deficit[level] += cl->quantum;
897 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
898 						 &q->hlevel[0].hprio[prio].ptr);
899 		}
900 		/* this used to be after charge_class but this constelation
901 		 * gives us slightly better performance
902 		 */
903 		if (!cl->un.leaf.q->q.qlen)
904 			htb_deactivate(q, cl);
905 		htb_charge_class(q, cl, level, skb);
906 	}
907 	return skb;
908 }
909 
910 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
911 {
912 	struct sk_buff *skb;
913 	struct htb_sched *q = qdisc_priv(sch);
914 	int level;
915 	s64 next_event;
916 	unsigned long start_at;
917 
918 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
919 	skb = __qdisc_dequeue_head(&q->direct_queue);
920 	if (skb != NULL) {
921 ok:
922 		qdisc_bstats_update(sch, skb);
923 		qdisc_qstats_backlog_dec(sch, skb);
924 		sch->q.qlen--;
925 		return skb;
926 	}
927 
928 	if (!sch->q.qlen)
929 		goto fin;
930 	q->now = ktime_get_ns();
931 	start_at = jiffies;
932 
933 	next_event = q->now + 5LLU * NSEC_PER_SEC;
934 
935 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
936 		/* common case optimization - skip event handler quickly */
937 		int m;
938 		s64 event = q->near_ev_cache[level];
939 
940 		if (q->now >= event) {
941 			event = htb_do_events(q, level, start_at);
942 			if (!event)
943 				event = q->now + NSEC_PER_SEC;
944 			q->near_ev_cache[level] = event;
945 		}
946 
947 		if (next_event > event)
948 			next_event = event;
949 
950 		m = ~q->row_mask[level];
951 		while (m != (int)(-1)) {
952 			int prio = ffz(m);
953 
954 			m |= 1 << prio;
955 			skb = htb_dequeue_tree(q, prio, level);
956 			if (likely(skb != NULL))
957 				goto ok;
958 		}
959 	}
960 	qdisc_qstats_overlimit(sch);
961 	if (likely(next_event > q->now))
962 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
963 	else
964 		schedule_work(&q->work);
965 fin:
966 	return skb;
967 }
968 
969 /* reset all classes */
970 /* always caled under BH & queue lock */
971 static void htb_reset(struct Qdisc *sch)
972 {
973 	struct htb_sched *q = qdisc_priv(sch);
974 	struct htb_class *cl;
975 	unsigned int i;
976 
977 	for (i = 0; i < q->clhash.hashsize; i++) {
978 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
979 			if (cl->level)
980 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
981 			else {
982 				if (cl->un.leaf.q)
983 					qdisc_reset(cl->un.leaf.q);
984 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
985 			}
986 			cl->prio_activity = 0;
987 			cl->cmode = HTB_CAN_SEND;
988 		}
989 	}
990 	qdisc_watchdog_cancel(&q->watchdog);
991 	__qdisc_reset_queue(&q->direct_queue);
992 	sch->q.qlen = 0;
993 	sch->qstats.backlog = 0;
994 	memset(q->hlevel, 0, sizeof(q->hlevel));
995 	memset(q->row_mask, 0, sizeof(q->row_mask));
996 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
997 		INIT_LIST_HEAD(q->drops + i);
998 }
999 
1000 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1001 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1002 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1003 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1004 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1005 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1006 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1007 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1008 };
1009 
1010 static void htb_work_func(struct work_struct *work)
1011 {
1012 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1013 	struct Qdisc *sch = q->watchdog.qdisc;
1014 
1015 	rcu_read_lock();
1016 	__netif_schedule(qdisc_root(sch));
1017 	rcu_read_unlock();
1018 }
1019 
1020 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1021 {
1022 	struct htb_sched *q = qdisc_priv(sch);
1023 	struct nlattr *tb[TCA_HTB_MAX + 1];
1024 	struct tc_htb_glob *gopt;
1025 	int err;
1026 	int i;
1027 
1028 	qdisc_watchdog_init(&q->watchdog, sch);
1029 	INIT_WORK(&q->work, htb_work_func);
1030 
1031 	if (!opt)
1032 		return -EINVAL;
1033 
1034 	err = tcf_block_get(&q->block, &q->filter_list, sch);
1035 	if (err)
1036 		return err;
1037 
1038 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1039 	if (err < 0)
1040 		return err;
1041 
1042 	if (!tb[TCA_HTB_INIT])
1043 		return -EINVAL;
1044 
1045 	gopt = nla_data(tb[TCA_HTB_INIT]);
1046 	if (gopt->version != HTB_VER >> 16)
1047 		return -EINVAL;
1048 
1049 	err = qdisc_class_hash_init(&q->clhash);
1050 	if (err < 0)
1051 		return err;
1052 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1053 		INIT_LIST_HEAD(q->drops + i);
1054 
1055 	qdisc_skb_head_init(&q->direct_queue);
1056 
1057 	if (tb[TCA_HTB_DIRECT_QLEN])
1058 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1059 	else
1060 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1061 
1062 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1063 		q->rate2quantum = 1;
1064 	q->defcls = gopt->defcls;
1065 
1066 	return 0;
1067 }
1068 
1069 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1070 {
1071 	struct htb_sched *q = qdisc_priv(sch);
1072 	struct nlattr *nest;
1073 	struct tc_htb_glob gopt;
1074 
1075 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1076 	 * no change can happen on the qdisc parameters.
1077 	 */
1078 
1079 	gopt.direct_pkts = q->direct_pkts;
1080 	gopt.version = HTB_VER;
1081 	gopt.rate2quantum = q->rate2quantum;
1082 	gopt.defcls = q->defcls;
1083 	gopt.debug = 0;
1084 
1085 	nest = nla_nest_start(skb, TCA_OPTIONS);
1086 	if (nest == NULL)
1087 		goto nla_put_failure;
1088 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1089 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1090 		goto nla_put_failure;
1091 
1092 	return nla_nest_end(skb, nest);
1093 
1094 nla_put_failure:
1095 	nla_nest_cancel(skb, nest);
1096 	return -1;
1097 }
1098 
1099 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1100 			  struct sk_buff *skb, struct tcmsg *tcm)
1101 {
1102 	struct htb_class *cl = (struct htb_class *)arg;
1103 	struct nlattr *nest;
1104 	struct tc_htb_opt opt;
1105 
1106 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1107 	 * no change can happen on the class parameters.
1108 	 */
1109 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1110 	tcm->tcm_handle = cl->common.classid;
1111 	if (!cl->level && cl->un.leaf.q)
1112 		tcm->tcm_info = cl->un.leaf.q->handle;
1113 
1114 	nest = nla_nest_start(skb, TCA_OPTIONS);
1115 	if (nest == NULL)
1116 		goto nla_put_failure;
1117 
1118 	memset(&opt, 0, sizeof(opt));
1119 
1120 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1121 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1122 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1123 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1124 	opt.quantum = cl->quantum;
1125 	opt.prio = cl->prio;
1126 	opt.level = cl->level;
1127 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1128 		goto nla_put_failure;
1129 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1130 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1131 			      TCA_HTB_PAD))
1132 		goto nla_put_failure;
1133 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1134 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1135 			      TCA_HTB_PAD))
1136 		goto nla_put_failure;
1137 
1138 	return nla_nest_end(skb, nest);
1139 
1140 nla_put_failure:
1141 	nla_nest_cancel(skb, nest);
1142 	return -1;
1143 }
1144 
1145 static int
1146 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1147 {
1148 	struct htb_class *cl = (struct htb_class *)arg;
1149 	struct gnet_stats_queue qs = {
1150 		.drops = cl->drops,
1151 		.overlimits = cl->overlimits,
1152 	};
1153 	__u32 qlen = 0;
1154 
1155 	if (!cl->level && cl->un.leaf.q) {
1156 		qlen = cl->un.leaf.q->q.qlen;
1157 		qs.backlog = cl->un.leaf.q->qstats.backlog;
1158 	}
1159 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1160 				    INT_MIN, INT_MAX);
1161 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1162 				     INT_MIN, INT_MAX);
1163 
1164 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1165 				  d, NULL, &cl->bstats) < 0 ||
1166 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1167 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1168 		return -1;
1169 
1170 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1171 }
1172 
1173 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1174 		     struct Qdisc **old)
1175 {
1176 	struct htb_class *cl = (struct htb_class *)arg;
1177 
1178 	if (cl->level)
1179 		return -EINVAL;
1180 	if (new == NULL &&
1181 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1182 				     cl->common.classid)) == NULL)
1183 		return -ENOBUFS;
1184 
1185 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
1186 	return 0;
1187 }
1188 
1189 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1190 {
1191 	struct htb_class *cl = (struct htb_class *)arg;
1192 	return !cl->level ? cl->un.leaf.q : NULL;
1193 }
1194 
1195 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1196 {
1197 	struct htb_class *cl = (struct htb_class *)arg;
1198 
1199 	htb_deactivate(qdisc_priv(sch), cl);
1200 }
1201 
1202 static inline int htb_parent_last_child(struct htb_class *cl)
1203 {
1204 	if (!cl->parent)
1205 		/* the root class */
1206 		return 0;
1207 	if (cl->parent->children > 1)
1208 		/* not the last child */
1209 		return 0;
1210 	return 1;
1211 }
1212 
1213 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1214 			       struct Qdisc *new_q)
1215 {
1216 	struct htb_class *parent = cl->parent;
1217 
1218 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1219 
1220 	if (parent->cmode != HTB_CAN_SEND)
1221 		htb_safe_rb_erase(&parent->pq_node,
1222 				  &q->hlevel[parent->level].wait_pq);
1223 
1224 	parent->level = 0;
1225 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1226 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1227 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1228 	parent->tokens = parent->buffer;
1229 	parent->ctokens = parent->cbuffer;
1230 	parent->t_c = ktime_get_ns();
1231 	parent->cmode = HTB_CAN_SEND;
1232 }
1233 
1234 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1235 {
1236 	if (!cl->level) {
1237 		WARN_ON(!cl->un.leaf.q);
1238 		qdisc_destroy(cl->un.leaf.q);
1239 	}
1240 	gen_kill_estimator(&cl->rate_est);
1241 	tcf_block_put(cl->block);
1242 	kfree(cl);
1243 }
1244 
1245 static void htb_destroy(struct Qdisc *sch)
1246 {
1247 	struct htb_sched *q = qdisc_priv(sch);
1248 	struct hlist_node *next;
1249 	struct htb_class *cl;
1250 	unsigned int i;
1251 
1252 	cancel_work_sync(&q->work);
1253 	qdisc_watchdog_cancel(&q->watchdog);
1254 	/* This line used to be after htb_destroy_class call below
1255 	 * and surprisingly it worked in 2.4. But it must precede it
1256 	 * because filter need its target class alive to be able to call
1257 	 * unbind_filter on it (without Oops).
1258 	 */
1259 	tcf_block_put(q->block);
1260 
1261 	for (i = 0; i < q->clhash.hashsize; i++) {
1262 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1263 			tcf_block_put(cl->block);
1264 			cl->block = NULL;
1265 		}
1266 	}
1267 	for (i = 0; i < q->clhash.hashsize; i++) {
1268 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1269 					  common.hnode)
1270 			htb_destroy_class(sch, cl);
1271 	}
1272 	qdisc_class_hash_destroy(&q->clhash);
1273 	__qdisc_reset_queue(&q->direct_queue);
1274 }
1275 
1276 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1277 {
1278 	struct htb_sched *q = qdisc_priv(sch);
1279 	struct htb_class *cl = (struct htb_class *)arg;
1280 	struct Qdisc *new_q = NULL;
1281 	int last_child = 0;
1282 
1283 	/* TODO: why don't allow to delete subtree ? references ? does
1284 	 * tc subsys guarantee us that in htb_destroy it holds no class
1285 	 * refs so that we can remove children safely there ?
1286 	 */
1287 	if (cl->children || cl->filter_cnt)
1288 		return -EBUSY;
1289 
1290 	if (!cl->level && htb_parent_last_child(cl)) {
1291 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1292 					  cl->parent->common.classid);
1293 		last_child = 1;
1294 	}
1295 
1296 	sch_tree_lock(sch);
1297 
1298 	if (!cl->level) {
1299 		unsigned int qlen = cl->un.leaf.q->q.qlen;
1300 		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1301 
1302 		qdisc_reset(cl->un.leaf.q);
1303 		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1304 	}
1305 
1306 	/* delete from hash and active; remainder in destroy_class */
1307 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1308 	if (cl->parent)
1309 		cl->parent->children--;
1310 
1311 	if (cl->prio_activity)
1312 		htb_deactivate(q, cl);
1313 
1314 	if (cl->cmode != HTB_CAN_SEND)
1315 		htb_safe_rb_erase(&cl->pq_node,
1316 				  &q->hlevel[cl->level].wait_pq);
1317 
1318 	if (last_child)
1319 		htb_parent_to_leaf(q, cl, new_q);
1320 
1321 	sch_tree_unlock(sch);
1322 
1323 	htb_destroy_class(sch, cl);
1324 	return 0;
1325 }
1326 
1327 static int htb_change_class(struct Qdisc *sch, u32 classid,
1328 			    u32 parentid, struct nlattr **tca,
1329 			    unsigned long *arg)
1330 {
1331 	int err = -EINVAL;
1332 	struct htb_sched *q = qdisc_priv(sch);
1333 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1334 	struct nlattr *opt = tca[TCA_OPTIONS];
1335 	struct nlattr *tb[TCA_HTB_MAX + 1];
1336 	struct tc_htb_opt *hopt;
1337 	u64 rate64, ceil64;
1338 
1339 	/* extract all subattrs from opt attr */
1340 	if (!opt)
1341 		goto failure;
1342 
1343 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1344 	if (err < 0)
1345 		goto failure;
1346 
1347 	err = -EINVAL;
1348 	if (tb[TCA_HTB_PARMS] == NULL)
1349 		goto failure;
1350 
1351 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1352 
1353 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1354 	if (!hopt->rate.rate || !hopt->ceil.rate)
1355 		goto failure;
1356 
1357 	/* Keeping backward compatible with rate_table based iproute2 tc */
1358 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1359 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1360 
1361 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1362 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1363 
1364 	if (!cl) {		/* new class */
1365 		struct Qdisc *new_q;
1366 		int prio;
1367 		struct {
1368 			struct nlattr		nla;
1369 			struct gnet_estimator	opt;
1370 		} est = {
1371 			.nla = {
1372 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1373 				.nla_type	= TCA_RATE,
1374 			},
1375 			.opt = {
1376 				/* 4s interval, 16s averaging constant */
1377 				.interval	= 2,
1378 				.ewma_log	= 2,
1379 			},
1380 		};
1381 
1382 		/* check for valid classid */
1383 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1384 		    htb_find(classid, sch))
1385 			goto failure;
1386 
1387 		/* check maximal depth */
1388 		if (parent && parent->parent && parent->parent->level < 2) {
1389 			pr_err("htb: tree is too deep\n");
1390 			goto failure;
1391 		}
1392 		err = -ENOBUFS;
1393 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1394 		if (!cl)
1395 			goto failure;
1396 
1397 		err = tcf_block_get(&cl->block, &cl->filter_list, sch);
1398 		if (err) {
1399 			kfree(cl);
1400 			goto failure;
1401 		}
1402 		if (htb_rate_est || tca[TCA_RATE]) {
1403 			err = gen_new_estimator(&cl->bstats, NULL,
1404 						&cl->rate_est,
1405 						NULL,
1406 						qdisc_root_sleeping_running(sch),
1407 						tca[TCA_RATE] ? : &est.nla);
1408 			if (err) {
1409 				tcf_block_put(cl->block);
1410 				kfree(cl);
1411 				goto failure;
1412 			}
1413 		}
1414 
1415 		cl->children = 0;
1416 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1417 		RB_CLEAR_NODE(&cl->pq_node);
1418 
1419 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1420 			RB_CLEAR_NODE(&cl->node[prio]);
1421 
1422 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1423 		 * so that can't be used inside of sch_tree_lock
1424 		 * -- thanks to Karlis Peisenieks
1425 		 */
1426 		new_q = qdisc_create_dflt(sch->dev_queue,
1427 					  &pfifo_qdisc_ops, classid);
1428 		sch_tree_lock(sch);
1429 		if (parent && !parent->level) {
1430 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1431 			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1432 
1433 			/* turn parent into inner node */
1434 			qdisc_reset(parent->un.leaf.q);
1435 			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1436 			qdisc_destroy(parent->un.leaf.q);
1437 			if (parent->prio_activity)
1438 				htb_deactivate(q, parent);
1439 
1440 			/* remove from evt list because of level change */
1441 			if (parent->cmode != HTB_CAN_SEND) {
1442 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1443 				parent->cmode = HTB_CAN_SEND;
1444 			}
1445 			parent->level = (parent->parent ? parent->parent->level
1446 					 : TC_HTB_MAXDEPTH) - 1;
1447 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1448 		}
1449 		/* leaf (we) needs elementary qdisc */
1450 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1451 
1452 		cl->common.classid = classid;
1453 		cl->parent = parent;
1454 
1455 		/* set class to be in HTB_CAN_SEND state */
1456 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1457 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1458 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1459 		cl->t_c = ktime_get_ns();
1460 		cl->cmode = HTB_CAN_SEND;
1461 
1462 		/* attach to the hash list and parent's family */
1463 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1464 		if (parent)
1465 			parent->children++;
1466 		if (cl->un.leaf.q != &noop_qdisc)
1467 			qdisc_hash_add(cl->un.leaf.q, true);
1468 	} else {
1469 		if (tca[TCA_RATE]) {
1470 			err = gen_replace_estimator(&cl->bstats, NULL,
1471 						    &cl->rate_est,
1472 						    NULL,
1473 						    qdisc_root_sleeping_running(sch),
1474 						    tca[TCA_RATE]);
1475 			if (err)
1476 				return err;
1477 		}
1478 		sch_tree_lock(sch);
1479 	}
1480 
1481 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1482 
1483 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1484 
1485 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1486 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1487 
1488 	/* it used to be a nasty bug here, we have to check that node
1489 	 * is really leaf before changing cl->un.leaf !
1490 	 */
1491 	if (!cl->level) {
1492 		u64 quantum = cl->rate.rate_bytes_ps;
1493 
1494 		do_div(quantum, q->rate2quantum);
1495 		cl->quantum = min_t(u64, quantum, INT_MAX);
1496 
1497 		if (!hopt->quantum && cl->quantum < 1000) {
1498 			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1499 				cl->common.classid);
1500 			cl->quantum = 1000;
1501 		}
1502 		if (!hopt->quantum && cl->quantum > 200000) {
1503 			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1504 				cl->common.classid);
1505 			cl->quantum = 200000;
1506 		}
1507 		if (hopt->quantum)
1508 			cl->quantum = hopt->quantum;
1509 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1510 			cl->prio = TC_HTB_NUMPRIO - 1;
1511 	}
1512 
1513 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1514 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1515 
1516 	sch_tree_unlock(sch);
1517 
1518 	qdisc_class_hash_grow(sch, &q->clhash);
1519 
1520 	*arg = (unsigned long)cl;
1521 	return 0;
1522 
1523 failure:
1524 	return err;
1525 }
1526 
1527 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
1528 {
1529 	struct htb_sched *q = qdisc_priv(sch);
1530 	struct htb_class *cl = (struct htb_class *)arg;
1531 
1532 	return cl ? cl->block : q->block;
1533 }
1534 
1535 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1536 				     u32 classid)
1537 {
1538 	struct htb_class *cl = htb_find(classid, sch);
1539 
1540 	/*if (cl && !cl->level) return 0;
1541 	 * The line above used to be there to prevent attaching filters to
1542 	 * leaves. But at least tc_index filter uses this just to get class
1543 	 * for other reasons so that we have to allow for it.
1544 	 * ----
1545 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1546 	 * another way to "lock" the class - unlike "get" this lock can
1547 	 * be broken by class during destroy IIUC.
1548 	 */
1549 	if (cl)
1550 		cl->filter_cnt++;
1551 	return (unsigned long)cl;
1552 }
1553 
1554 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1555 {
1556 	struct htb_class *cl = (struct htb_class *)arg;
1557 
1558 	if (cl)
1559 		cl->filter_cnt--;
1560 }
1561 
1562 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1563 {
1564 	struct htb_sched *q = qdisc_priv(sch);
1565 	struct htb_class *cl;
1566 	unsigned int i;
1567 
1568 	if (arg->stop)
1569 		return;
1570 
1571 	for (i = 0; i < q->clhash.hashsize; i++) {
1572 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1573 			if (arg->count < arg->skip) {
1574 				arg->count++;
1575 				continue;
1576 			}
1577 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1578 				arg->stop = 1;
1579 				return;
1580 			}
1581 			arg->count++;
1582 		}
1583 	}
1584 }
1585 
1586 static const struct Qdisc_class_ops htb_class_ops = {
1587 	.graft		=	htb_graft,
1588 	.leaf		=	htb_leaf,
1589 	.qlen_notify	=	htb_qlen_notify,
1590 	.find		=	htb_search,
1591 	.change		=	htb_change_class,
1592 	.delete		=	htb_delete,
1593 	.walk		=	htb_walk,
1594 	.tcf_block	=	htb_tcf_block,
1595 	.bind_tcf	=	htb_bind_filter,
1596 	.unbind_tcf	=	htb_unbind_filter,
1597 	.dump		=	htb_dump_class,
1598 	.dump_stats	=	htb_dump_class_stats,
1599 };
1600 
1601 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1602 	.cl_ops		=	&htb_class_ops,
1603 	.id		=	"htb",
1604 	.priv_size	=	sizeof(struct htb_sched),
1605 	.enqueue	=	htb_enqueue,
1606 	.dequeue	=	htb_dequeue,
1607 	.peek		=	qdisc_peek_dequeued,
1608 	.init		=	htb_init,
1609 	.reset		=	htb_reset,
1610 	.destroy	=	htb_destroy,
1611 	.dump		=	htb_dump,
1612 	.owner		=	THIS_MODULE,
1613 };
1614 
1615 static int __init htb_module_init(void)
1616 {
1617 	return register_qdisc(&htb_qdisc_ops);
1618 }
1619 static void __exit htb_module_exit(void)
1620 {
1621 	unregister_qdisc(&htb_qdisc_ops);
1622 }
1623 
1624 module_init(htb_module_init)
1625 module_exit(htb_module_exit)
1626 MODULE_LICENSE("GPL");
1627