xref: /openbmc/linux/net/sched/sch_htb.c (revision e983940270f10fe8551baf0098be76ea478294a3)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 
44 /* HTB algorithm.
45     Author: devik@cdi.cz
46     ========================================================================
47     HTB is like TBF with multiple classes. It is also similar to CBQ because
48     it allows assigning a priority to each class in the hierarchy.
49     In fact it is another implementation of Floyd's formal sharing.
50 
51     Levels:
52     Each class is assigned a level. A leaf ALWAYS has level 0 and root
53     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
54     one less than their parent.
55 */
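/* Editor's illustrative sketch (not part of the original sources): with
 * TC_HTB_MAXDEPTH of 8, a root class sits at level 7, an inner class directly
 * under it at level 6, and every leaf at level 0.  A small hierarchy might be
 * built from user space with iproute2 roughly like this (device name and
 * rates are placeholders):
 *
 *     tc qdisc add dev eth0 root handle 1: htb default 20
 *     tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit
 *     tc class add dev eth0 parent 1:1 classid 1:10 htb rate 60mbit ceil 100mbit
 *     tc class add dev eth0 parent 1:1 classid 1:20 htb rate 40mbit ceil 100mbit
 */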
56 
57 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
58 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
59 
60 #if HTB_VER >> 16 != TC_HTB_PROTOVER
61 #error "Mismatched sch_htb.c and pkt_sch.h"
62 #endif
63 
64 /* Module parameter and sysfs export */
65 module_param    (htb_hysteresis, int, 0640);
66 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
67 
68 static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
69 module_param(htb_rate_est, int, 0640);
70 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
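/* Editor's usage note (assumed standard module_param sysfs paths): with the
 * 0640 permissions above, both knobs can typically be flipped at runtime, e.g.
 *
 *     echo 1 > /sys/module/sch_htb/parameters/htb_hysteresis
 *     echo 1 > /sys/module/sch_htb/parameters/htb_rate_est
 *
 * or set at load time with "modprobe sch_htb htb_hysteresis=1".
 */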
71 
72 /* used internally to keep the status of a single class */
73 enum htb_cmode {
74 	HTB_CANT_SEND,		/* class can't send and can't borrow */
75 	HTB_MAY_BORROW,		/* class can't send but may borrow */
76 	HTB_CAN_SEND		/* class can send */
77 };
78 
79 struct htb_prio {
80 	union {
81 		struct rb_root	row;
82 		struct rb_root	feed;
83 	};
84 	struct rb_node	*ptr;
85 	/* When a class changes from state 1->2 and disconnects from its
86 	 * parent's feed, we lose the ptr value and start from the
87 	 * first child again. Here we store the classid of the
88 	 * last valid ptr (used when ptr is NULL).
89 	 */
90 	u32		last_ptr_id;
91 };
92 
93 /* interior & leaf nodes; props specific to leaves are marked L:
94  * To reduce false sharing, place mostly read fields at beginning,
95  * and mostly written ones at the end.
96  */
97 struct htb_class {
98 	struct Qdisc_class_common common;
99 	struct psched_ratecfg	rate;
100 	struct psched_ratecfg	ceil;
101 	s64			buffer, cbuffer;/* token bucket depth/rate */
102 	s64			mbuffer;	/* max wait time */
103 	u32			prio;		/* these two are used only by leaves... */
104 	int			quantum;	/* but stored for parent-to-leaf return */
105 
106 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
107 	int			filter_cnt;
108 	int			refcnt;		/* usage count of this class */
109 
110 	int			level;		/* our level (see above) */
111 	unsigned int		children;
112 	struct htb_class	*parent;	/* parent class */
113 
114 	struct gnet_stats_rate_est64 rate_est;
115 
116 	/*
117 	 * Written often fields
118 	 */
119 	struct gnet_stats_basic_packed bstats;
120 	struct tc_htb_xstats	xstats;	/* our special stats */
121 
122 	/* token bucket parameters */
123 	s64			tokens, ctokens;/* current number of tokens */
124 	s64			t_c;		/* checkpoint time */
125 
126 	union {
127 		struct htb_class_leaf {
128 			struct list_head drop_list;
129 			int		deficit[TC_HTB_MAXDEPTH];
130 			struct Qdisc	*q;
131 		} leaf;
132 		struct htb_class_inner {
133 			struct htb_prio clprio[TC_HTB_NUMPRIO];
134 		} inner;
135 	} un;
136 	s64			pq_key;
137 
138 	int			prio_activity;	/* for which prios are we active */
139 	enum htb_cmode		cmode;		/* current mode of the class */
140 	struct rb_node		pq_node;	/* node for event queue */
141 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
142 
143 	unsigned int drops ____cacheline_aligned_in_smp;
144 };
145 
146 struct htb_level {
147 	struct rb_root	wait_pq;
148 	struct htb_prio hprio[TC_HTB_NUMPRIO];
149 };
150 
151 struct htb_sched {
152 	struct Qdisc_class_hash clhash;
153 	int			defcls;		/* class where unclassified flows go to */
154 	int			rate2quantum;	/* quant = rate / rate2quantum */
155 
156 	/* filters for qdisc itself */
157 	struct tcf_proto __rcu	*filter_list;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non-shaped skbs; let them go directly thru */
165 	struct qdisc_skb_head	direct_queue;
166 	long			direct_pkts;
167 
168 	struct qdisc_watchdog	watchdog;
169 
170 	s64			now;	/* cached dequeue time */
171 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 };
180 
181 /* find class in global hash table using given handle */
182 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
183 {
184 	struct htb_sched *q = qdisc_priv(sch);
185 	struct Qdisc_class_common *clc;
186 
187 	clc = qdisc_class_find(&q->clhash, handle);
188 	if (clc == NULL)
189 		return NULL;
190 	return container_of(clc, struct htb_class, common);
191 }
192 
193 /**
194  * htb_classify - classify a packet into class
195  *
196  * It returns NULL if the packet should be dropped or -1 if the packet
197  * should be passed directly thru. In all other cases a leaf class is returned.
198  * We allow direct class selection by classid in skb->priority. Then we examine
199  * filters in the qdisc and in inner nodes (if a higher filter points to an inner
200  * node). If we end up with classid MAJOR:0 we enqueue the skb into the special
201  * internal fifo (direct). These packets then go directly thru. If we still
202  * have no valid leaf we try to use the MAJOR:default leaf. If still unsuccessful,
203  * we finish and return the direct queue.
204  */
205 #define HTB_DIRECT ((struct htb_class *)-1L)
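/* Editor's example of the classification outcomes described above: a packet
 * whose skb->priority equals the qdisc handle (X:0) goes straight to the
 * direct queue; one whose skb->priority equals a leaf classid is enqueued to
 * that leaf without consulting filters; otherwise the filter chains run, and
 * on failure the MAJOR:default class is used (or the direct queue if that
 * default is missing or not a leaf).
 */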
206 
207 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
208 				      int *qerr)
209 {
210 	struct htb_sched *q = qdisc_priv(sch);
211 	struct htb_class *cl;
212 	struct tcf_result res;
213 	struct tcf_proto *tcf;
214 	int result;
215 
216 	/* allow selecting a class by setting skb->priority to a valid classid;
217 	 * note that nfmark can be used too by attaching an fw filter with no
218 	 * rules in it
219 	 */
220 	if (skb->priority == sch->handle)
221 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
222 	cl = htb_find(skb->priority, sch);
223 	if (cl) {
224 		if (cl->level == 0)
225 			return cl;
226 		/* Start with inner filter chain if a non-leaf class is selected */
227 		tcf = rcu_dereference_bh(cl->filter_list);
228 	} else {
229 		tcf = rcu_dereference_bh(q->filter_list);
230 	}
231 
232 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
233 	while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) {
234 #ifdef CONFIG_NET_CLS_ACT
235 		switch (result) {
236 		case TC_ACT_QUEUED:
237 		case TC_ACT_STOLEN:
238 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
239 		case TC_ACT_SHOT:
240 			return NULL;
241 		}
242 #endif
243 		cl = (void *)res.class;
244 		if (!cl) {
245 			if (res.classid == sch->handle)
246 				return HTB_DIRECT;	/* X:0 (direct flow) */
247 			cl = htb_find(res.classid, sch);
248 			if (!cl)
249 				break;	/* filter selected invalid classid */
250 		}
251 		if (!cl->level)
252 			return cl;	/* we hit leaf; return it */
253 
254 		/* we have got inner class; apply inner filter chain */
255 		tcf = rcu_dereference_bh(cl->filter_list);
256 	}
257 	/* classification failed; try to use default class */
258 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
259 	if (!cl || cl->level)
260 		return HTB_DIRECT;	/* bad default .. this is safe bet */
261 	return cl;
262 }
263 
264 /**
265  * htb_add_to_id_tree - adds class to the round robin list
266  *
267  * The routine adds the class to the list (actually a tree) sorted by classid.
268  * Make sure that the class is not already on such a list for the given prio.
269  */
270 static void htb_add_to_id_tree(struct rb_root *root,
271 			       struct htb_class *cl, int prio)
272 {
273 	struct rb_node **p = &root->rb_node, *parent = NULL;
274 
275 	while (*p) {
276 		struct htb_class *c;
277 		parent = *p;
278 		c = rb_entry(parent, struct htb_class, node[prio]);
279 
280 		if (cl->common.classid > c->common.classid)
281 			p = &parent->rb_right;
282 		else
283 			p = &parent->rb_left;
284 	}
285 	rb_link_node(&cl->node[prio], parent, p);
286 	rb_insert_color(&cl->node[prio], root);
287 }
288 
289 /**
290  * htb_add_to_wait_tree - adds class to the event queue with delay
291  *
292  * The class is added to the priority event queue to indicate that the class
293  * will change its mode at time cl->pq_key (nanoseconds, on the q->now clock).
294  * Make sure that the class is not already in the queue.
295  */
296 static void htb_add_to_wait_tree(struct htb_sched *q,
297 				 struct htb_class *cl, s64 delay)
298 {
299 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
300 
301 	cl->pq_key = q->now + delay;
302 	if (cl->pq_key == q->now)
303 		cl->pq_key++;
304 
305 	/* update the nearest event cache */
306 	if (q->near_ev_cache[cl->level] > cl->pq_key)
307 		q->near_ev_cache[cl->level] = cl->pq_key;
308 
309 	while (*p) {
310 		struct htb_class *c;
311 		parent = *p;
312 		c = rb_entry(parent, struct htb_class, pq_node);
313 		if (cl->pq_key >= c->pq_key)
314 			p = &parent->rb_right;
315 		else
316 			p = &parent->rb_left;
317 	}
318 	rb_link_node(&cl->pq_node, parent, p);
319 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
320 }
321 
322 /**
323  * htb_next_rb_node - finds next node in binary tree
324  *
325  * When we are past the last key, *n is set to NULL.
326  * Average complexity is 2 steps per call.
327  */
328 static inline void htb_next_rb_node(struct rb_node **n)
329 {
330 	*n = rb_next(*n);
331 }
332 
333 /**
334  * htb_add_class_to_row - add class to its row
335  *
336  * The class is added to row at priorities marked in mask.
337  * It does nothing if mask == 0.
338  */
339 static inline void htb_add_class_to_row(struct htb_sched *q,
340 					struct htb_class *cl, int mask)
341 {
342 	q->row_mask[cl->level] |= mask;
343 	while (mask) {
344 		int prio = ffz(~mask);
345 		mask &= ~(1 << prio);
346 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
347 	}
348 }
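/* Editor's note: the ffz(~mask) idiom above iterates over the set bits of
 * mask, lowest first; e.g. mask = 0x5 visits prio 0 and then prio 2, clearing
 * each bit as it goes.  The same pattern recurs throughout this file.
 */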
349 
350 /* If this triggers, it is a bug in this code, but it need not be fatal */
351 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
352 {
353 	if (RB_EMPTY_NODE(rb)) {
354 		WARN_ON(1);
355 	} else {
356 		rb_erase(rb, root);
357 		RB_CLEAR_NODE(rb);
358 	}
359 }
360 
361 
362 /**
363  * htb_remove_class_from_row - removes class from its row
364  *
365  * The class is removed from row at priorities marked in mask.
366  * It does nothing if mask == 0.
367  */
368 static inline void htb_remove_class_from_row(struct htb_sched *q,
369 						 struct htb_class *cl, int mask)
370 {
371 	int m = 0;
372 	struct htb_level *hlevel = &q->hlevel[cl->level];
373 
374 	while (mask) {
375 		int prio = ffz(~mask);
376 		struct htb_prio *hprio = &hlevel->hprio[prio];
377 
378 		mask &= ~(1 << prio);
379 		if (hprio->ptr == cl->node + prio)
380 			htb_next_rb_node(&hprio->ptr);
381 
382 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
383 		if (!hprio->row.rb_node)
384 			m |= 1 << prio;
385 	}
386 	q->row_mask[cl->level] &= ~m;
387 }
388 
389 /**
390  * htb_activate_prios - creates the active class's feed chain
391  *
392  * The class is connected to its ancestors and/or the appropriate rows
393  * for the priorities it participates in. cl->cmode must be the new
394  * (activated) mode. It does nothing if cl->prio_activity == 0.
395  */
396 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
397 {
398 	struct htb_class *p = cl->parent;
399 	long m, mask = cl->prio_activity;
400 
401 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
402 		m = mask;
403 		while (m) {
404 			int prio = ffz(~m);
405 			m &= ~(1 << prio);
406 
407 			if (p->un.inner.clprio[prio].feed.rb_node)
408 				/* parent already has its feed in use, so
409 				 * clear the bit in mask, as the parent is already ok
410 				 */
411 				mask &= ~(1 << prio);
412 
413 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
414 		}
415 		p->prio_activity |= mask;
416 		cl = p;
417 		p = cl->parent;
418 
419 	}
420 	if (cl->cmode == HTB_CAN_SEND && mask)
421 		htb_add_class_to_row(q, cl, mask);
422 }
423 
424 /**
425  * htb_deactivate_prios - remove class from feed chain
426  *
427  * cl->cmode must represent the old mode (before deactivation). It does
428  * nothing if cl->prio_activity == 0. The class is removed from all feed
429  * chains and rows.
430  */
431 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
432 {
433 	struct htb_class *p = cl->parent;
434 	long m, mask = cl->prio_activity;
435 
436 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
437 		m = mask;
438 		mask = 0;
439 		while (m) {
440 			int prio = ffz(~m);
441 			m &= ~(1 << prio);
442 
443 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
444 				/* we are removing a child which is pointed to from the
445 				 * parent's feed - forget the pointer but remember the
446 				 * classid
447 				 */
448 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
449 				p->un.inner.clprio[prio].ptr = NULL;
450 			}
451 
452 			htb_safe_rb_erase(cl->node + prio,
453 					  &p->un.inner.clprio[prio].feed);
454 
455 			if (!p->un.inner.clprio[prio].feed.rb_node)
456 				mask |= 1 << prio;
457 		}
458 
459 		p->prio_activity &= ~mask;
460 		cl = p;
461 		p = cl->parent;
462 
463 	}
464 	if (cl->cmode == HTB_CAN_SEND && mask)
465 		htb_remove_class_from_row(q, cl, mask);
466 }
467 
468 static inline s64 htb_lowater(const struct htb_class *cl)
469 {
470 	if (htb_hysteresis)
471 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
472 	else
473 		return 0;
474 }
475 static inline s64 htb_hiwater(const struct htb_class *cl)
476 {
477 	if (htb_hysteresis)
478 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
479 	else
480 		return 0;
481 }
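/* Editor's worked example of the hysteresis thresholds above (illustrative
 * numbers, assuming htb_hysteresis is enabled): a class currently in
 * HTB_CAN_SEND with buffer worth 20000 ns of tokens only leaves CAN_SEND once
 * cl->tokens + diff drops below -20000 (hiwater = -buffer), while a class not
 * in CAN_SEND re-enters it as soon as its tokens reach 0 (hiwater = 0).  The
 * lowater test on ctokens behaves the same way with cbuffer, which is what
 * damps rapid mode flapping around zero tokens.
 */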
482 
483 
484 /**
485  * htb_class_mode - computes and returns current class mode
486  *
487  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
488  * is not HTB_CAN_SEND then *diff is set to the time remaining until cl
489  * will change its state (it is later used to set cl->pq_key).
490  * Also it is worth noting that the class mode doesn't change simply
491  * at cl->{c,}tokens == 0; rather, there can be hysteresis over the
492  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
493  * mode transitions per time unit. The speed gain is about 1/6.
494  */
495 static inline enum htb_cmode
496 htb_class_mode(struct htb_class *cl, s64 *diff)
497 {
498 	s64 toks;
499 
500 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
501 		*diff = -toks;
502 		return HTB_CANT_SEND;
503 	}
504 
505 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
506 		return HTB_CAN_SEND;
507 
508 	*diff = -toks;
509 	return HTB_MAY_BORROW;
510 }
511 
512 /**
513  * htb_change_class_mode - changes a class's mode
514  *
515  * This should be the only way to change a class's mode under normal
516  * circumstances. The routine updates the feed list linkage, changes the mode
517  * and adds the class to the wait event queue if appropriate. The new mode should
518  * be different from the old one and cl->pq_key has to be valid if changing
519  * to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
520  */
521 static void
522 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
523 {
524 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
525 
526 	if (new_mode == cl->cmode)
527 		return;
528 
529 	if (cl->prio_activity) {	/* not necessary: speed optimization */
530 		if (cl->cmode != HTB_CANT_SEND)
531 			htb_deactivate_prios(q, cl);
532 		cl->cmode = new_mode;
533 		if (new_mode != HTB_CANT_SEND)
534 			htb_activate_prios(q, cl);
535 	} else
536 		cl->cmode = new_mode;
537 }
538 
539 /**
540  * htb_activate - inserts leaf cl into appropriate active feeds
541  *
542  * The routine learns the (new) priority of the leaf and activates the feed
543  * chain for that prio. It can safely be called on an already active leaf.
544  * It also adds the leaf to the drop list.
545  */
546 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
547 {
548 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
549 
550 	if (!cl->prio_activity) {
551 		cl->prio_activity = 1 << cl->prio;
552 		htb_activate_prios(q, cl);
553 		list_add_tail(&cl->un.leaf.drop_list,
554 			      q->drops + cl->prio);
555 	}
556 }
557 
558 /**
559  * htb_deactivate - remove leaf cl from active feeds
560  *
561  * Make sure that the leaf is active. In other words, it can't be called
562  * with a non-active leaf. It also removes the class from the drop list.
563  */
564 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
565 {
566 	WARN_ON(!cl->prio_activity);
567 
568 	htb_deactivate_prios(q, cl);
569 	cl->prio_activity = 0;
570 	list_del_init(&cl->un.leaf.drop_list);
571 }
572 
573 static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
574 			     struct qdisc_skb_head *qh)
575 {
576 	struct sk_buff *last = qh->tail;
577 
578 	if (last) {
579 		skb->next = NULL;
580 		last->next = skb;
581 		qh->tail = skb;
582 	} else {
583 		qh->tail = skb;
584 		qh->head = skb;
585 	}
586 	qh->qlen++;
587 }
588 
589 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
590 		       struct sk_buff **to_free)
591 {
592 	int uninitialized_var(ret);
593 	struct htb_sched *q = qdisc_priv(sch);
594 	struct htb_class *cl = htb_classify(skb, sch, &ret);
595 
596 	if (cl == HTB_DIRECT) {
597 		/* enqueue to helper queue */
598 		if (q->direct_queue.qlen < q->direct_qlen) {
599 			htb_enqueue_tail(skb, sch, &q->direct_queue);
600 			q->direct_pkts++;
601 		} else {
602 			return qdisc_drop(skb, sch, to_free);
603 		}
604 #ifdef CONFIG_NET_CLS_ACT
605 	} else if (!cl) {
606 		if (ret & __NET_XMIT_BYPASS)
607 			qdisc_qstats_drop(sch);
608 		__qdisc_drop(skb, to_free);
609 		return ret;
610 #endif
611 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
612 					to_free)) != NET_XMIT_SUCCESS) {
613 		if (net_xmit_drop_count(ret)) {
614 			qdisc_qstats_drop(sch);
615 			cl->drops++;
616 		}
617 		return ret;
618 	} else {
619 		htb_activate(q, cl);
620 	}
621 
622 	qdisc_qstats_backlog_inc(sch, skb);
623 	sch->q.qlen++;
624 	return NET_XMIT_SUCCESS;
625 }
626 
627 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
628 {
629 	s64 toks = diff + cl->tokens;
630 
631 	if (toks > cl->buffer)
632 		toks = cl->buffer;
633 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
634 	if (toks <= -cl->mbuffer)
635 		toks = 1 - cl->mbuffer;
636 
637 	cl->tokens = toks;
638 }
639 
640 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
641 {
642 	s64 toks = diff + cl->ctokens;
643 
644 	if (toks > cl->cbuffer)
645 		toks = cl->cbuffer;
646 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
647 	if (toks <= -cl->mbuffer)
648 		toks = 1 - cl->mbuffer;
649 
650 	cl->ctokens = toks;
651 }
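/* Editor's worked example for the two accounting helpers above (illustrative
 * numbers): with rate = 125000 bytes/s (1 Mbit/s), psched_l2t_ns() charges a
 * 1500-byte packet roughly 1500 / 125000 s = 12 ms = 12,000,000 ns of tokens.
 * If the class was idle for diff = 20 ms it first gains 20,000,000 ns (clamped
 * to cl->buffer) and then pays the 12,000,000 ns, so cl->tokens stays positive
 * and the class remains in HTB_CAN_SEND.
 */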
652 
653 /**
654  * htb_charge_class - charges amount "bytes" to leaf and ancestors
655  *
656  * Routine assumes that packet "bytes" long was dequeued from leaf cl
657  * borrowing from "level". It accounts bytes to ceil leaky bucket for
658  * leaf and all ancestors and to rate bucket for ancestors at levels
659  * "level" and higher. It also handles possible change of mode resulting
660  * from the update. Note that the mode can also increase here (MAY_BORROW to
661  * CAN_SEND) because we can use a more precise clock than the event queue here.
662  * In such a case we remove the class from the event queue first.
663  */
664 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
665 			     int level, struct sk_buff *skb)
666 {
667 	int bytes = qdisc_pkt_len(skb);
668 	enum htb_cmode old_mode;
669 	s64 diff;
670 
671 	while (cl) {
672 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
673 		if (cl->level >= level) {
674 			if (cl->level == level)
675 				cl->xstats.lends++;
676 			htb_accnt_tokens(cl, bytes, diff);
677 		} else {
678 			cl->xstats.borrows++;
679 			cl->tokens += diff;	/* we moved t_c; update tokens */
680 		}
681 		htb_accnt_ctokens(cl, bytes, diff);
682 		cl->t_c = q->now;
683 
684 		old_mode = cl->cmode;
685 		diff = 0;
686 		htb_change_class_mode(q, cl, &diff);
687 		if (old_mode != cl->cmode) {
688 			if (old_mode != HTB_CAN_SEND)
689 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
690 			if (cl->cmode != HTB_CAN_SEND)
691 				htb_add_to_wait_tree(q, cl, diff);
692 		}
693 
694 		/* update basic stats except for leaves which are already updated */
695 		if (cl->level)
696 			bstats_update(&cl->bstats, skb);
697 
698 		cl = cl->parent;
699 	}
700 }
701 
702 /**
703  * htb_do_events - make mode changes to classes at the level
704  *
705  * Scans event queue for pending events and applies them. Returns time of
706  * next pending event (0 for no event in pq, q->now for too many events).
707  * Note: Only events with cl->pq_key <= q->now are applied.
708  */
709 static s64 htb_do_events(struct htb_sched *q, const int level,
710 			 unsigned long start)
711 {
712 	/* don't run for longer than 2 jiffies; 2 is used instead of
713 	 * 1 to simplify things when jiffy is going to be incremented
714 	 * too soon
715 	 */
716 	unsigned long stop_at = start + 2;
717 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
718 
719 	while (time_before(jiffies, stop_at)) {
720 		struct htb_class *cl;
721 		s64 diff;
722 		struct rb_node *p = rb_first(wait_pq);
723 
724 		if (!p)
725 			return 0;
726 
727 		cl = rb_entry(p, struct htb_class, pq_node);
728 		if (cl->pq_key > q->now)
729 			return cl->pq_key;
730 
731 		htb_safe_rb_erase(p, wait_pq);
732 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
733 		htb_change_class_mode(q, cl, &diff);
734 		if (cl->cmode != HTB_CAN_SEND)
735 			htb_add_to_wait_tree(q, cl, diff);
736 	}
737 
738 	/* too much load - let's continue after a break for scheduling */
739 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
740 		pr_warn("htb: too many events!\n");
741 		q->warned |= HTB_WARN_TOOMANYEVENTS;
742 	}
743 
744 	return q->now;
745 }
746 
747 /* Returns class->node+prio from the id-tree where the class's id is >= id.
748  * NULL if no such one exists.
749  */
750 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
751 					      u32 id)
752 {
753 	struct rb_node *r = NULL;
754 	while (n) {
755 		struct htb_class *cl =
756 		    rb_entry(n, struct htb_class, node[prio]);
757 
758 		if (id > cl->common.classid) {
759 			n = n->rb_right;
760 		} else if (id < cl->common.classid) {
761 			r = n;
762 			n = n->rb_left;
763 		} else {
764 			return n;
765 		}
766 	}
767 	return r;
768 }
769 
770 /**
771  * htb_lookup_leaf - returns next leaf class in DRR order
772  *
773  * Find the leaf which the current feed pointer points to.
774  */
775 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
776 {
777 	int i;
778 	struct {
779 		struct rb_node *root;
780 		struct rb_node **pptr;
781 		u32 *pid;
782 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
783 
784 	BUG_ON(!hprio->row.rb_node);
785 	sp->root = hprio->row.rb_node;
786 	sp->pptr = &hprio->ptr;
787 	sp->pid = &hprio->last_ptr_id;
788 
789 	for (i = 0; i < 65535; i++) {
790 		if (!*sp->pptr && *sp->pid) {
791 			/* ptr was invalidated but id is valid - try to recover
792 			 * the original or next ptr
793 			 */
794 			*sp->pptr =
795 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
796 		}
797 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
798 				 * can become out of date quickly
799 				 */
800 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
801 			*sp->pptr = sp->root;
802 			while ((*sp->pptr)->rb_left)
803 				*sp->pptr = (*sp->pptr)->rb_left;
804 			if (sp > stk) {
805 				sp--;
806 				if (!*sp->pptr) {
807 					WARN_ON(1);
808 					return NULL;
809 				}
810 				htb_next_rb_node(sp->pptr);
811 			}
812 		} else {
813 			struct htb_class *cl;
814 			struct htb_prio *clp;
815 
816 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
817 			if (!cl->level)
818 				return cl;
819 			clp = &cl->un.inner.clprio[prio];
820 			(++sp)->root = clp->feed.rb_node;
821 			sp->pptr = &clp->ptr;
822 			sp->pid = &clp->last_ptr_id;
823 		}
824 	}
825 	WARN_ON(1);
826 	return NULL;
827 }
828 
829 /* dequeues a packet at the given priority and level; call only if
830  * you are sure that there is an active class at prio/level
831  */
832 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
833 					const int level)
834 {
835 	struct sk_buff *skb = NULL;
836 	struct htb_class *cl, *start;
837 	struct htb_level *hlevel = &q->hlevel[level];
838 	struct htb_prio *hprio = &hlevel->hprio[prio];
839 
840 	/* look initial class up in the row */
841 	start = cl = htb_lookup_leaf(hprio, prio);
842 
843 	do {
844 next:
845 		if (unlikely(!cl))
846 			return NULL;
847 
848 		/* class can be empty - it is unlikely but can be true if leaf
849 		 * qdisc drops packets in enqueue routine or if someone used
850 		 * graft operation on the leaf since last dequeue;
851 		 * simply deactivate and skip such class
852 		 */
853 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
854 			struct htb_class *next;
855 			htb_deactivate(q, cl);
856 
857 			/* row/level might become empty */
858 			if ((q->row_mask[level] & (1 << prio)) == 0)
859 				return NULL;
860 
861 			next = htb_lookup_leaf(hprio, prio);
862 
863 			if (cl == start)	/* fix start if we just deleted it */
864 				start = next;
865 			cl = next;
866 			goto next;
867 		}
868 
869 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
870 		if (likely(skb != NULL))
871 			break;
872 
873 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
874 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
875 					 &q->hlevel[0].hprio[prio].ptr);
876 		cl = htb_lookup_leaf(hprio, prio);
877 
878 	} while (cl != start);
879 
880 	if (likely(skb != NULL)) {
881 		bstats_update(&cl->bstats, skb);
882 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
883 		if (cl->un.leaf.deficit[level] < 0) {
884 			cl->un.leaf.deficit[level] += cl->quantum;
885 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
886 						 &q->hlevel[0].hprio[prio].ptr);
887 		}
888 		/* this used to be after charge_class but this constellation
889 		 * gives us slightly better performance
890 		 */
891 		if (!cl->un.leaf.q->q.qlen)
892 			htb_deactivate(q, cl);
893 		htb_charge_class(q, cl, level, skb);
894 	}
895 	return skb;
896 }
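/* Editor's note on the deficit round-robin step above (illustrative numbers):
 * a leaf with quantum = 1500 whose current deficit at this level is 1500 and
 * which just sent a 1000-byte packet keeps the DRR pointer on itself (deficit
 * drops to 500); once a later packet pushes the deficit below zero, one
 * quantum is added back and the round-robin pointer advances to the next
 * class at this prio/level.
 */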
897 
898 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
899 {
900 	struct sk_buff *skb;
901 	struct htb_sched *q = qdisc_priv(sch);
902 	int level;
903 	s64 next_event;
904 	unsigned long start_at;
905 
906 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
907 	skb = __qdisc_dequeue_head(&q->direct_queue);
908 	if (skb != NULL) {
909 ok:
910 		qdisc_bstats_update(sch, skb);
911 		qdisc_qstats_backlog_dec(sch, skb);
912 		sch->q.qlen--;
913 		return skb;
914 	}
915 
916 	if (!sch->q.qlen)
917 		goto fin;
918 	q->now = ktime_get_ns();
919 	start_at = jiffies;
920 
921 	next_event = q->now + 5LLU * NSEC_PER_SEC;
922 
923 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
924 		/* common case optimization - skip event handler quickly */
925 		int m;
926 		s64 event = q->near_ev_cache[level];
927 
928 		if (q->now >= event) {
929 			event = htb_do_events(q, level, start_at);
930 			if (!event)
931 				event = q->now + NSEC_PER_SEC;
932 			q->near_ev_cache[level] = event;
933 		}
934 
935 		if (next_event > event)
936 			next_event = event;
937 
938 		m = ~q->row_mask[level];
939 		while (m != (int)(-1)) {
940 			int prio = ffz(m);
941 
942 			m |= 1 << prio;
943 			skb = htb_dequeue_tree(q, prio, level);
944 			if (likely(skb != NULL))
945 				goto ok;
946 		}
947 	}
948 	qdisc_qstats_overlimit(sch);
949 	if (likely(next_event > q->now))
950 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
951 	else
952 		schedule_work(&q->work);
953 fin:
954 	return skb;
955 }
956 
957 /* reset all classes */
958 /* always called under BH & queue lock */
959 static void htb_reset(struct Qdisc *sch)
960 {
961 	struct htb_sched *q = qdisc_priv(sch);
962 	struct htb_class *cl;
963 	unsigned int i;
964 
965 	for (i = 0; i < q->clhash.hashsize; i++) {
966 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
967 			if (cl->level)
968 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
969 			else {
970 				if (cl->un.leaf.q)
971 					qdisc_reset(cl->un.leaf.q);
972 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
973 			}
974 			cl->prio_activity = 0;
975 			cl->cmode = HTB_CAN_SEND;
976 		}
977 	}
978 	qdisc_watchdog_cancel(&q->watchdog);
979 	__qdisc_reset_queue(&q->direct_queue);
980 	sch->q.qlen = 0;
981 	sch->qstats.backlog = 0;
982 	memset(q->hlevel, 0, sizeof(q->hlevel));
983 	memset(q->row_mask, 0, sizeof(q->row_mask));
984 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
985 		INIT_LIST_HEAD(q->drops + i);
986 }
987 
988 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
989 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
990 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
991 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
992 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
993 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
994 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
995 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
996 };
997 
998 static void htb_work_func(struct work_struct *work)
999 {
1000 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1001 	struct Qdisc *sch = q->watchdog.qdisc;
1002 
1003 	rcu_read_lock();
1004 	__netif_schedule(qdisc_root(sch));
1005 	rcu_read_unlock();
1006 }
1007 
1008 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1009 {
1010 	struct htb_sched *q = qdisc_priv(sch);
1011 	struct nlattr *tb[TCA_HTB_MAX + 1];
1012 	struct tc_htb_glob *gopt;
1013 	int err;
1014 	int i;
1015 
1016 	if (!opt)
1017 		return -EINVAL;
1018 
1019 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1020 	if (err < 0)
1021 		return err;
1022 
1023 	if (!tb[TCA_HTB_INIT])
1024 		return -EINVAL;
1025 
1026 	gopt = nla_data(tb[TCA_HTB_INIT]);
1027 	if (gopt->version != HTB_VER >> 16)
1028 		return -EINVAL;
1029 
1030 	err = qdisc_class_hash_init(&q->clhash);
1031 	if (err < 0)
1032 		return err;
1033 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1034 		INIT_LIST_HEAD(q->drops + i);
1035 
1036 	qdisc_watchdog_init(&q->watchdog, sch);
1037 	INIT_WORK(&q->work, htb_work_func);
1038 	qdisc_skb_head_init(&q->direct_queue);
1039 
1040 	if (tb[TCA_HTB_DIRECT_QLEN])
1041 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1042 	else
1043 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1044 
1045 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1046 		q->rate2quantum = 1;
1047 	q->defcls = gopt->defcls;
1048 
1049 	return 0;
1050 }
1051 
1052 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1053 {
1054 	struct htb_sched *q = qdisc_priv(sch);
1055 	struct nlattr *nest;
1056 	struct tc_htb_glob gopt;
1057 
1058 	/* It's safe to not acquire the qdisc lock. As we hold RTNL,
1059 	 * no change can happen on the qdisc parameters.
1060 	 */
1061 
1062 	gopt.direct_pkts = q->direct_pkts;
1063 	gopt.version = HTB_VER;
1064 	gopt.rate2quantum = q->rate2quantum;
1065 	gopt.defcls = q->defcls;
1066 	gopt.debug = 0;
1067 
1068 	nest = nla_nest_start(skb, TCA_OPTIONS);
1069 	if (nest == NULL)
1070 		goto nla_put_failure;
1071 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1072 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1073 		goto nla_put_failure;
1074 
1075 	return nla_nest_end(skb, nest);
1076 
1077 nla_put_failure:
1078 	nla_nest_cancel(skb, nest);
1079 	return -1;
1080 }
1081 
1082 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1083 			  struct sk_buff *skb, struct tcmsg *tcm)
1084 {
1085 	struct htb_class *cl = (struct htb_class *)arg;
1086 	struct nlattr *nest;
1087 	struct tc_htb_opt opt;
1088 
1089 	/* It's safe to not acquire the qdisc lock. As we hold RTNL,
1090 	 * no change can happen on the class parameters.
1091 	 */
1092 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1093 	tcm->tcm_handle = cl->common.classid;
1094 	if (!cl->level && cl->un.leaf.q)
1095 		tcm->tcm_info = cl->un.leaf.q->handle;
1096 
1097 	nest = nla_nest_start(skb, TCA_OPTIONS);
1098 	if (nest == NULL)
1099 		goto nla_put_failure;
1100 
1101 	memset(&opt, 0, sizeof(opt));
1102 
1103 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1104 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1105 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1106 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1107 	opt.quantum = cl->quantum;
1108 	opt.prio = cl->prio;
1109 	opt.level = cl->level;
1110 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1111 		goto nla_put_failure;
1112 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1113 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1114 			      TCA_HTB_PAD))
1115 		goto nla_put_failure;
1116 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1117 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1118 			      TCA_HTB_PAD))
1119 		goto nla_put_failure;
1120 
1121 	return nla_nest_end(skb, nest);
1122 
1123 nla_put_failure:
1124 	nla_nest_cancel(skb, nest);
1125 	return -1;
1126 }
1127 
1128 static int
1129 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1130 {
1131 	struct htb_class *cl = (struct htb_class *)arg;
1132 	struct gnet_stats_queue qs = {
1133 		.drops = cl->drops,
1134 	};
1135 	__u32 qlen = 0;
1136 
1137 	if (!cl->level && cl->un.leaf.q) {
1138 		qlen = cl->un.leaf.q->q.qlen;
1139 		qs.backlog = cl->un.leaf.q->qstats.backlog;
1140 	}
1141 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1142 				    INT_MIN, INT_MAX);
1143 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1144 				     INT_MIN, INT_MAX);
1145 
1146 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1147 				  d, NULL, &cl->bstats) < 0 ||
1148 	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1149 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1150 		return -1;
1151 
1152 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1153 }
1154 
1155 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1156 		     struct Qdisc **old)
1157 {
1158 	struct htb_class *cl = (struct htb_class *)arg;
1159 
1160 	if (cl->level)
1161 		return -EINVAL;
1162 	if (new == NULL &&
1163 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1164 				     cl->common.classid)) == NULL)
1165 		return -ENOBUFS;
1166 
1167 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
1168 	return 0;
1169 }
1170 
1171 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1172 {
1173 	struct htb_class *cl = (struct htb_class *)arg;
1174 	return !cl->level ? cl->un.leaf.q : NULL;
1175 }
1176 
1177 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1178 {
1179 	struct htb_class *cl = (struct htb_class *)arg;
1180 
1181 	if (cl->un.leaf.q->q.qlen == 0)
1182 		htb_deactivate(qdisc_priv(sch), cl);
1183 }
1184 
1185 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1186 {
1187 	struct htb_class *cl = htb_find(classid, sch);
1188 	if (cl)
1189 		cl->refcnt++;
1190 	return (unsigned long)cl;
1191 }
1192 
1193 static inline int htb_parent_last_child(struct htb_class *cl)
1194 {
1195 	if (!cl->parent)
1196 		/* the root class */
1197 		return 0;
1198 	if (cl->parent->children > 1)
1199 		/* not the last child */
1200 		return 0;
1201 	return 1;
1202 }
1203 
1204 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1205 			       struct Qdisc *new_q)
1206 {
1207 	struct htb_class *parent = cl->parent;
1208 
1209 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1210 
1211 	if (parent->cmode != HTB_CAN_SEND)
1212 		htb_safe_rb_erase(&parent->pq_node,
1213 				  &q->hlevel[parent->level].wait_pq);
1214 
1215 	parent->level = 0;
1216 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1217 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1218 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1219 	parent->tokens = parent->buffer;
1220 	parent->ctokens = parent->cbuffer;
1221 	parent->t_c = ktime_get_ns();
1222 	parent->cmode = HTB_CAN_SEND;
1223 }
1224 
1225 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1226 {
1227 	if (!cl->level) {
1228 		WARN_ON(!cl->un.leaf.q);
1229 		qdisc_destroy(cl->un.leaf.q);
1230 	}
1231 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1232 	tcf_destroy_chain(&cl->filter_list);
1233 	kfree(cl);
1234 }
1235 
1236 static void htb_destroy(struct Qdisc *sch)
1237 {
1238 	struct htb_sched *q = qdisc_priv(sch);
1239 	struct hlist_node *next;
1240 	struct htb_class *cl;
1241 	unsigned int i;
1242 
1243 	cancel_work_sync(&q->work);
1244 	qdisc_watchdog_cancel(&q->watchdog);
1245 	/* This line used to be after htb_destroy_class call below
1246 	 * and surprisingly it worked in 2.4. But it must precede it
1247 	 * because a filter needs its target class alive to be able to call
1248 	 * unbind_filter on it (without Oops).
1249 	 */
1250 	tcf_destroy_chain(&q->filter_list);
1251 
1252 	for (i = 0; i < q->clhash.hashsize; i++) {
1253 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
1254 			tcf_destroy_chain(&cl->filter_list);
1255 	}
1256 	for (i = 0; i < q->clhash.hashsize; i++) {
1257 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1258 					  common.hnode)
1259 			htb_destroy_class(sch, cl);
1260 	}
1261 	qdisc_class_hash_destroy(&q->clhash);
1262 	__qdisc_reset_queue(&q->direct_queue);
1263 }
1264 
1265 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1266 {
1267 	struct htb_sched *q = qdisc_priv(sch);
1268 	struct htb_class *cl = (struct htb_class *)arg;
1269 	struct Qdisc *new_q = NULL;
1270 	int last_child = 0;
1271 
1272 	/* TODO: why don't we allow deleting a subtree? references? does the
1273 	 * tc subsys guarantee us that in htb_destroy it holds no class
1274 	 * refs, so that we can remove children safely there?
1275 	 */
1276 	if (cl->children || cl->filter_cnt)
1277 		return -EBUSY;
1278 
1279 	if (!cl->level && htb_parent_last_child(cl)) {
1280 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1281 					  cl->parent->common.classid);
1282 		last_child = 1;
1283 	}
1284 
1285 	sch_tree_lock(sch);
1286 
1287 	if (!cl->level) {
1288 		unsigned int qlen = cl->un.leaf.q->q.qlen;
1289 		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1290 
1291 		qdisc_reset(cl->un.leaf.q);
1292 		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1293 	}
1294 
1295 	/* delete from hash and active; remainder in destroy_class */
1296 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1297 	if (cl->parent)
1298 		cl->parent->children--;
1299 
1300 	if (cl->prio_activity)
1301 		htb_deactivate(q, cl);
1302 
1303 	if (cl->cmode != HTB_CAN_SEND)
1304 		htb_safe_rb_erase(&cl->pq_node,
1305 				  &q->hlevel[cl->level].wait_pq);
1306 
1307 	if (last_child)
1308 		htb_parent_to_leaf(q, cl, new_q);
1309 
1310 	BUG_ON(--cl->refcnt == 0);
1311 	/*
1312 	 * This shouldn't happen: we "hold" one cops->get() when called
1313 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1314 	 */
1315 
1316 	sch_tree_unlock(sch);
1317 	return 0;
1318 }
1319 
1320 static void htb_put(struct Qdisc *sch, unsigned long arg)
1321 {
1322 	struct htb_class *cl = (struct htb_class *)arg;
1323 
1324 	if (--cl->refcnt == 0)
1325 		htb_destroy_class(sch, cl);
1326 }
1327 
1328 static int htb_change_class(struct Qdisc *sch, u32 classid,
1329 			    u32 parentid, struct nlattr **tca,
1330 			    unsigned long *arg)
1331 {
1332 	int err = -EINVAL;
1333 	struct htb_sched *q = qdisc_priv(sch);
1334 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1335 	struct nlattr *opt = tca[TCA_OPTIONS];
1336 	struct nlattr *tb[TCA_HTB_MAX + 1];
1337 	struct tc_htb_opt *hopt;
1338 	u64 rate64, ceil64;
1339 
1340 	/* extract all subattrs from opt attr */
1341 	if (!opt)
1342 		goto failure;
1343 
1344 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
1345 	if (err < 0)
1346 		goto failure;
1347 
1348 	err = -EINVAL;
1349 	if (tb[TCA_HTB_PARMS] == NULL)
1350 		goto failure;
1351 
1352 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1353 
1354 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1355 	if (!hopt->rate.rate || !hopt->ceil.rate)
1356 		goto failure;
1357 
1358 	/* Keeping backward compatible with rate_table based iproute2 tc */
1359 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1360 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1361 
1362 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1363 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1364 
1365 	if (!cl) {		/* new class */
1366 		struct Qdisc *new_q;
1367 		int prio;
1368 		struct {
1369 			struct nlattr		nla;
1370 			struct gnet_estimator	opt;
1371 		} est = {
1372 			.nla = {
1373 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1374 				.nla_type	= TCA_RATE,
1375 			},
1376 			.opt = {
1377 				/* 4s interval, 16s averaging constant */
1378 				.interval	= 2,
1379 				.ewma_log	= 2,
1380 			},
1381 		};
1382 
1383 		/* check for valid classid */
1384 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1385 		    htb_find(classid, sch))
1386 			goto failure;
1387 
1388 		/* check maximal depth */
1389 		if (parent && parent->parent && parent->parent->level < 2) {
1390 			pr_err("htb: tree is too deep\n");
1391 			goto failure;
1392 		}
1393 		err = -ENOBUFS;
1394 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1395 		if (!cl)
1396 			goto failure;
1397 
1398 		if (htb_rate_est || tca[TCA_RATE]) {
1399 			err = gen_new_estimator(&cl->bstats, NULL,
1400 						&cl->rate_est,
1401 						NULL,
1402 						qdisc_root_sleeping_running(sch),
1403 						tca[TCA_RATE] ? : &est.nla);
1404 			if (err) {
1405 				kfree(cl);
1406 				goto failure;
1407 			}
1408 		}
1409 
1410 		cl->refcnt = 1;
1411 		cl->children = 0;
1412 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1413 		RB_CLEAR_NODE(&cl->pq_node);
1414 
1415 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1416 			RB_CLEAR_NODE(&cl->node[prio]);
1417 
1418 		/* create the leaf qdisc early because it uses kmalloc(GFP_KERNEL),
1419 		 * which can't be used inside of sch_tree_lock
1420 		 * -- thanks to Karlis Peisenieks
1421 		 */
1422 		new_q = qdisc_create_dflt(sch->dev_queue,
1423 					  &pfifo_qdisc_ops, classid);
1424 		sch_tree_lock(sch);
1425 		if (parent && !parent->level) {
1426 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1427 			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1428 
1429 			/* turn parent into inner node */
1430 			qdisc_reset(parent->un.leaf.q);
1431 			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1432 			qdisc_destroy(parent->un.leaf.q);
1433 			if (parent->prio_activity)
1434 				htb_deactivate(q, parent);
1435 
1436 			/* remove from evt list because of level change */
1437 			if (parent->cmode != HTB_CAN_SEND) {
1438 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1439 				parent->cmode = HTB_CAN_SEND;
1440 			}
1441 			parent->level = (parent->parent ? parent->parent->level
1442 					 : TC_HTB_MAXDEPTH) - 1;
1443 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1444 		}
1445 		/* leaf (we) needs elementary qdisc */
1446 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1447 
1448 		cl->common.classid = classid;
1449 		cl->parent = parent;
1450 
1451 		/* set class to be in HTB_CAN_SEND state */
1452 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1453 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1454 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1455 		cl->t_c = ktime_get_ns();
1456 		cl->cmode = HTB_CAN_SEND;
1457 
1458 		/* attach to the hash list and parent's family */
1459 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1460 		if (parent)
1461 			parent->children++;
1462 	} else {
1463 		if (tca[TCA_RATE]) {
1464 			err = gen_replace_estimator(&cl->bstats, NULL,
1465 						    &cl->rate_est,
1466 						    NULL,
1467 						    qdisc_root_sleeping_running(sch),
1468 						    tca[TCA_RATE]);
1469 			if (err)
1470 				return err;
1471 		}
1472 		sch_tree_lock(sch);
1473 	}
1474 
1475 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1476 
1477 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1478 
1479 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1480 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1481 
1482 	/* there used to be a nasty bug here: we have to check that the node
1483 	 * is really a leaf before changing cl->un.leaf !
1484 	 */
1485 	if (!cl->level) {
1486 		u64 quantum = cl->rate.rate_bytes_ps;
1487 
1488 		do_div(quantum, q->rate2quantum);
1489 		cl->quantum = min_t(u64, quantum, INT_MAX);
1490 
1491 		if (!hopt->quantum && cl->quantum < 1000) {
1492 			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1493 				cl->common.classid);
1494 			cl->quantum = 1000;
1495 		}
1496 		if (!hopt->quantum && cl->quantum > 200000) {
1497 			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1498 				cl->common.classid);
1499 			cl->quantum = 200000;
1500 		}
1501 		if (hopt->quantum)
1502 			cl->quantum = hopt->quantum;
1503 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1504 			cl->prio = TC_HTB_NUMPRIO - 1;
1505 	}
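	/* Editor's worked example of the quantum heuristic above (illustrative
	 * numbers): with rate = 100 Mbit/s (12,500,000 bytes/s) and a typical
	 * r2q of 10, quantum = 1,250,000 bytes, which exceeds 200000 and is
	 * clamped with a warning unless an explicit quantum was supplied; with
	 * rate = 8 kbit/s (1000 bytes/s), quantum = 100 and is raised to the
	 * 1000-byte floor instead.
	 */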
1506 
1507 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1508 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1509 
1510 	sch_tree_unlock(sch);
1511 
1512 	qdisc_class_hash_grow(sch, &q->clhash);
1513 
1514 	*arg = (unsigned long)cl;
1515 	return 0;
1516 
1517 failure:
1518 	return err;
1519 }
1520 
1521 static struct tcf_proto __rcu **htb_find_tcf(struct Qdisc *sch,
1522 					     unsigned long arg)
1523 {
1524 	struct htb_sched *q = qdisc_priv(sch);
1525 	struct htb_class *cl = (struct htb_class *)arg;
1526 	struct tcf_proto __rcu **fl = cl ? &cl->filter_list : &q->filter_list;
1527 
1528 	return fl;
1529 }
1530 
1531 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1532 				     u32 classid)
1533 {
1534 	struct htb_class *cl = htb_find(classid, sch);
1535 
1536 	/*if (cl && !cl->level) return 0;
1537 	 * The line above used to be there to prevent attaching filters to
1538 	 * leaves. But at least the tc_index filter uses this just to get the class
1539 	 * for other reasons, so we have to allow it.
1540 	 * ----
1541 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1542 	 * another way to "lock" the class - unlike "get" this lock can
1543 	 * be broken by class during destroy IIUC.
1544 	 */
1545 	if (cl)
1546 		cl->filter_cnt++;
1547 	return (unsigned long)cl;
1548 }
1549 
1550 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1551 {
1552 	struct htb_class *cl = (struct htb_class *)arg;
1553 
1554 	if (cl)
1555 		cl->filter_cnt--;
1556 }
1557 
1558 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1559 {
1560 	struct htb_sched *q = qdisc_priv(sch);
1561 	struct htb_class *cl;
1562 	unsigned int i;
1563 
1564 	if (arg->stop)
1565 		return;
1566 
1567 	for (i = 0; i < q->clhash.hashsize; i++) {
1568 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1569 			if (arg->count < arg->skip) {
1570 				arg->count++;
1571 				continue;
1572 			}
1573 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1574 				arg->stop = 1;
1575 				return;
1576 			}
1577 			arg->count++;
1578 		}
1579 	}
1580 }
1581 
1582 static const struct Qdisc_class_ops htb_class_ops = {
1583 	.graft		=	htb_graft,
1584 	.leaf		=	htb_leaf,
1585 	.qlen_notify	=	htb_qlen_notify,
1586 	.get		=	htb_get,
1587 	.put		=	htb_put,
1588 	.change		=	htb_change_class,
1589 	.delete		=	htb_delete,
1590 	.walk		=	htb_walk,
1591 	.tcf_chain	=	htb_find_tcf,
1592 	.bind_tcf	=	htb_bind_filter,
1593 	.unbind_tcf	=	htb_unbind_filter,
1594 	.dump		=	htb_dump_class,
1595 	.dump_stats	=	htb_dump_class_stats,
1596 };
1597 
1598 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1599 	.cl_ops		=	&htb_class_ops,
1600 	.id		=	"htb",
1601 	.priv_size	=	sizeof(struct htb_sched),
1602 	.enqueue	=	htb_enqueue,
1603 	.dequeue	=	htb_dequeue,
1604 	.peek		=	qdisc_peek_dequeued,
1605 	.init		=	htb_init,
1606 	.reset		=	htb_reset,
1607 	.destroy	=	htb_destroy,
1608 	.dump		=	htb_dump,
1609 	.owner		=	THIS_MODULE,
1610 };
1611 
1612 static int __init htb_module_init(void)
1613 {
1614 	return register_qdisc(&htb_qdisc_ops);
1615 }
1616 static void __exit htb_module_exit(void)
1617 {
1618 	unregister_qdisc(&htb_qdisc_ops);
1619 }
1620 
1621 module_init(htb_module_init)
1622 module_exit(htb_module_exit)
1623 MODULE_LICENSE("GPL");
1624