xref: /openbmc/linux/net/sched/sch_htb.c (revision 981ab3f1)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 #include <net/pkt_cls.h>
44 
45 /* HTB algorithm.
46     Author: devik@cdi.cz
47     ========================================================================
48     HTB is like TBF with multiple classes. It is also similar to CBQ because
49     it allows a priority to be assigned to each class in the hierarchy.
50     In fact it is another implementation of Floyd's formal sharing.
51 
52     Levels:
53     Each class is assigned a level. A leaf ALWAYS has level 0 and root
54     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
55     one less than their parent.
56 */
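
/* Illustrative sketch only: one way such a hierarchy is typically built from
 * userspace with iproute2's tc (device name, handle and rates below are
 * arbitrary assumptions, not taken from this file):
 *
 *   tc qdisc add dev eth0 root handle 1: htb default 20
 *   tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:10 htb rate 60mbit ceil 100mbit
 *   tc class add dev eth0 parent 1:1 classid 1:20 htb rate 40mbit ceil 100mbit
 *
 * Here 1:10 and 1:20 are leaves (level 0), while 1:1, being a root-level
 * class, gets level TC_HTB_MAXDEPTH-1; see htb_change_class() for how levels
 * are assigned when a leaf gains children.
 */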
57 
58 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
59 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
60 
61 #if HTB_VER >> 16 != TC_HTB_PROTOVER
62 #error "Mismatched sch_htb.c and pkt_sch.h"
63 #endif
64 
65 /* Module parameter and sysfs export */
66 module_param    (htb_hysteresis, int, 0640);
67 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
68 
69 static int htb_rate_est = 0; /* whether htb classes get a default rate estimator */
70 module_param(htb_rate_est, int, 0640);
71 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
72 
73 /* used internally to keep the status of a single class */
74 enum htb_cmode {
75 	HTB_CANT_SEND,		/* class can't send and can't borrow */
76 	HTB_MAY_BORROW,		/* class can't send but may borrow */
77 	HTB_CAN_SEND		/* class can send */
78 };
79 
80 struct htb_prio {
81 	union {
82 		struct rb_root	row;
83 		struct rb_root	feed;
84 	};
85 	struct rb_node	*ptr;
86 	/* When class changes from state 1->2 and disconnects from
87 	 * parent's feed then we lose the ptr value and start from the
88 	 * first child again. Here we store classid of the
89 	 * last valid ptr (used when ptr is NULL).
90 	 */
91 	u32		last_ptr_id;
92 };
93 
94 /* interior & leaf nodes; props specific to leaves are marked L:
95  * To reduce false sharing, place mostly read fields at beginning,
96  * and mostly written ones at the end.
97  */
98 struct htb_class {
99 	struct Qdisc_class_common common;
100 	struct psched_ratecfg	rate;
101 	struct psched_ratecfg	ceil;
102 	s64			buffer, cbuffer;/* token bucket depth/rate */
103 	s64			mbuffer;	/* max wait time */
104 	u32			prio;		/* these two are used only by leaves... */
105 	int			quantum;	/* but stored for parent-to-leaf return */
106 
107 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
108 	struct tcf_block	*block;
109 	int			filter_cnt;
110 	int			refcnt;		/* usage count of this class */
111 
112 	int			level;		/* our level (see above) */
113 	unsigned int		children;
114 	struct htb_class	*parent;	/* parent class */
115 
116 	struct net_rate_estimator __rcu *rate_est;
117 
118 	/*
119 	 * Written often fields
120 	 */
121 	struct gnet_stats_basic_packed bstats;
122 	struct tc_htb_xstats	xstats;	/* our special stats */
123 
124 	/* token bucket parameters */
125 	s64			tokens, ctokens;/* current number of tokens */
126 	s64			t_c;		/* checkpoint time */
127 
128 	union {
129 		struct htb_class_leaf {
130 			struct list_head drop_list;
131 			int		deficit[TC_HTB_MAXDEPTH];
132 			struct Qdisc	*q;
133 		} leaf;
134 		struct htb_class_inner {
135 			struct htb_prio clprio[TC_HTB_NUMPRIO];
136 		} inner;
137 	} un;
138 	s64			pq_key;
139 
140 	int			prio_activity;	/* for which prios are we active */
141 	enum htb_cmode		cmode;		/* current mode of the class */
142 	struct rb_node		pq_node;	/* node for event queue */
143 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
144 
145 	unsigned int drops ____cacheline_aligned_in_smp;
146 };
147 
148 struct htb_level {
149 	struct rb_root	wait_pq;
150 	struct htb_prio hprio[TC_HTB_NUMPRIO];
151 };
152 
153 struct htb_sched {
154 	struct Qdisc_class_hash clhash;
155 	int			defcls;		/* class where unclassified flows go to */
156 	int			rate2quantum;	/* quant = rate / rate2quantum */
157 
158 	/* filters for qdisc itself */
159 	struct tcf_proto __rcu	*filter_list;
160 	struct tcf_block	*block;
161 
162 #define HTB_WARN_TOOMANYEVENTS	0x1
163 	unsigned int		warned;	/* only one warning */
164 	int			direct_qlen;
165 	struct work_struct	work;
166 
167 	/* non shaped skbs; let them go directly thru */
168 	struct qdisc_skb_head	direct_queue;
169 	long			direct_pkts;
170 
171 	struct qdisc_watchdog	watchdog;
172 
173 	s64			now;	/* cached dequeue time */
174 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
175 
176 	/* time of nearest event per level (row) */
177 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
178 
179 	int			row_mask[TC_HTB_MAXDEPTH];
180 
181 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
182 };
183 
184 /* find class in global hash table using given handle */
185 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
186 {
187 	struct htb_sched *q = qdisc_priv(sch);
188 	struct Qdisc_class_common *clc;
189 
190 	clc = qdisc_class_find(&q->clhash, handle);
191 	if (clc == NULL)
192 		return NULL;
193 	return container_of(clc, struct htb_class, common);
194 }
195 
196 /**
197  * htb_classify - classify a packet into class
198  *
199  * It returns NULL if the packet should be dropped or -1 if the packet
200  * should be passed directly thru. In all other cases leaf class is returned.
201  * We allow direct class selection by classid in skb->priority. Then we examine
202  * filters in qdisc and in inner nodes (if higher filter points to the inner
203  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
204  * internal fifo (direct). These packets then go directly thru. If we still
205  * have no valid leaf we try to use the MAJOR:default leaf. If that is still
206  * unsuccessful we finish and return the direct queue.
207  */
208 #define HTB_DIRECT ((struct htb_class *)-1L)
209 
210 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
211 				      int *qerr)
212 {
213 	struct htb_sched *q = qdisc_priv(sch);
214 	struct htb_class *cl;
215 	struct tcf_result res;
216 	struct tcf_proto *tcf;
217 	int result;
218 
219 	/* allow selecting the class by setting skb->priority to a valid classid;
220 	 * note that nfmark can be used too by attaching filter fw with no
221 	 * rules in it
222 	 */
223 	if (skb->priority == sch->handle)
224 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
225 	cl = htb_find(skb->priority, sch);
226 	if (cl) {
227 		if (cl->level == 0)
228 			return cl;
229 		/* Start with inner filter chain if a non-leaf class is selected */
230 		tcf = rcu_dereference_bh(cl->filter_list);
231 	} else {
232 		tcf = rcu_dereference_bh(q->filter_list);
233 	}
234 
235 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
236 	while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
237 #ifdef CONFIG_NET_CLS_ACT
238 		switch (result) {
239 		case TC_ACT_QUEUED:
240 		case TC_ACT_STOLEN:
241 		case TC_ACT_TRAP:
242 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
243 		case TC_ACT_SHOT:
244 			return NULL;
245 		}
246 #endif
247 		cl = (void *)res.class;
248 		if (!cl) {
249 			if (res.classid == sch->handle)
250 				return HTB_DIRECT;	/* X:0 (direct flow) */
251 			cl = htb_find(res.classid, sch);
252 			if (!cl)
253 				break;	/* filter selected invalid classid */
254 		}
255 		if (!cl->level)
256 			return cl;	/* we hit leaf; return it */
257 
258 		/* we have got inner class; apply inner filter chain */
259 		tcf = rcu_dereference_bh(cl->filter_list);
260 	}
261 	/* classification failed; try to use default class */
262 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
263 	if (!cl || cl->level)
264 		return HTB_DIRECT;	/* bad default .. this is safe bet */
265 	return cl;
266 }
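
/* Illustrative summary of htb_classify() outcomes, assuming a qdisc with
 * handle 1: and default class 1:20 (arbitrary example values):
 *
 *   skb->priority == 1:0            -> HTB_DIRECT (bypasses shaping)
 *   skb->priority == 1:10 (a leaf)  -> leaf 1:10, filters are skipped
 *   a filter returns an inner class -> that class's own filter chain is
 *                                      consulted next
 *   nothing matches                 -> default class 1:20, or HTB_DIRECT if
 *                                      1:20 does not exist or is not a leaf
 */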
267 
268 /**
269  * htb_add_to_id_tree - adds class to the round robin list
270  *
271  * The routine adds the class to the list (actually a tree) sorted by classid.
272  * Make sure that the class is not already on such a list for the given prio.
273  */
274 static void htb_add_to_id_tree(struct rb_root *root,
275 			       struct htb_class *cl, int prio)
276 {
277 	struct rb_node **p = &root->rb_node, *parent = NULL;
278 
279 	while (*p) {
280 		struct htb_class *c;
281 		parent = *p;
282 		c = rb_entry(parent, struct htb_class, node[prio]);
283 
284 		if (cl->common.classid > c->common.classid)
285 			p = &parent->rb_right;
286 		else
287 			p = &parent->rb_left;
288 	}
289 	rb_link_node(&cl->node[prio], parent, p);
290 	rb_insert_color(&cl->node[prio], root);
291 }
292 
293 /**
294  * htb_add_to_wait_tree - adds class to the event queue with delay
295  *
296  * The class is added to the priority event queue to indicate that the class
297  * will change its mode at time cl->pq_key (in nanoseconds). Make sure that
298  * the class is not already in the queue.
299  */
300 static void htb_add_to_wait_tree(struct htb_sched *q,
301 				 struct htb_class *cl, s64 delay)
302 {
303 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
304 
305 	cl->pq_key = q->now + delay;
306 	if (cl->pq_key == q->now)
307 		cl->pq_key++;
308 
309 	/* update the nearest event cache */
310 	if (q->near_ev_cache[cl->level] > cl->pq_key)
311 		q->near_ev_cache[cl->level] = cl->pq_key;
312 
313 	while (*p) {
314 		struct htb_class *c;
315 		parent = *p;
316 		c = rb_entry(parent, struct htb_class, pq_node);
317 		if (cl->pq_key >= c->pq_key)
318 			p = &parent->rb_right;
319 		else
320 			p = &parent->rb_left;
321 	}
322 	rb_link_node(&cl->pq_node, parent, p);
323 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
324 }
325 
326 /**
327  * htb_next_rb_node - finds next node in binary tree
328  *
329  * When we are past the last key, *n is set to NULL.
330  * Average complexity is 2 steps per call.
331  */
332 static inline void htb_next_rb_node(struct rb_node **n)
333 {
334 	*n = rb_next(*n);
335 }
336 
337 /**
338  * htb_add_class_to_row - add class to its row
339  *
340  * The class is added to row at priorities marked in mask.
341  * It does nothing if mask == 0.
342  */
343 static inline void htb_add_class_to_row(struct htb_sched *q,
344 					struct htb_class *cl, int mask)
345 {
346 	q->row_mask[cl->level] |= mask;
347 	while (mask) {
348 		int prio = ffz(~mask);
349 		mask &= ~(1 << prio);
350 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
351 	}
352 }
353 
354 /* If this triggers, it is a bug in this code, but it need not be fatal */
355 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
356 {
357 	if (RB_EMPTY_NODE(rb)) {
358 		WARN_ON(1);
359 	} else {
360 		rb_erase(rb, root);
361 		RB_CLEAR_NODE(rb);
362 	}
363 }
364 
365 
366 /**
367  * htb_remove_class_from_row - removes class from its row
368  *
369  * The class is removed from row at priorities marked in mask.
370  * It does nothing if mask == 0.
371  */
372 static inline void htb_remove_class_from_row(struct htb_sched *q,
373 						 struct htb_class *cl, int mask)
374 {
375 	int m = 0;
376 	struct htb_level *hlevel = &q->hlevel[cl->level];
377 
378 	while (mask) {
379 		int prio = ffz(~mask);
380 		struct htb_prio *hprio = &hlevel->hprio[prio];
381 
382 		mask &= ~(1 << prio);
383 		if (hprio->ptr == cl->node + prio)
384 			htb_next_rb_node(&hprio->ptr);
385 
386 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
387 		if (!hprio->row.rb_node)
388 			m |= 1 << prio;
389 	}
390 	q->row_mask[cl->level] &= ~m;
391 }
392 
393 /**
394  * htb_activate_prios - creates an active class's feed chain
395  *
396  * The class is connected to ancestors and/or appropriate rows
397  * for the priorities it participates in. cl->cmode must be the new
398  * (activated) mode. It does nothing if cl->prio_activity == 0.
399  */
400 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
401 {
402 	struct htb_class *p = cl->parent;
403 	long m, mask = cl->prio_activity;
404 
405 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
406 		m = mask;
407 		while (m) {
408 			int prio = ffz(~m);
409 			m &= ~(1 << prio);
410 
411 			if (p->un.inner.clprio[prio].feed.rb_node)
412 				/* parent already has its feed in use, so
413 				 * clear the bit in mask as the parent is already ok
414 				 */
415 				mask &= ~(1 << prio);
416 
417 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
418 		}
419 		p->prio_activity |= mask;
420 		cl = p;
421 		p = cl->parent;
422 
423 	}
424 	if (cl->cmode == HTB_CAN_SEND && mask)
425 		htb_add_class_to_row(q, cl, mask);
426 }
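
/* Illustrative walk-through of htb_activate_prios() above: a borrowing
 * (HTB_MAY_BORROW) leaf that becomes active at prio 1 is linked into
 * parent->un.inner.clprio[1].feed.  If the parent's feed for that prio was
 * already non-empty, the parent is already active there and the bit is
 * dropped from the mask; otherwise the newly set bit propagates to the next
 * ancestor.  The walk ends at the first class that is not borrowing, and if
 * that class is in HTB_CAN_SEND it is added to the row for the remaining
 * prios via htb_add_class_to_row().
 */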
427 
428 /**
429  * htb_deactivate_prios - remove class from feed chain
430  *
431  * cl->cmode must represent the old mode (before deactivation). It does
432  * nothing if cl->prio_activity == 0. The class is removed from all feed
433  * chains and rows.
434  */
435 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
436 {
437 	struct htb_class *p = cl->parent;
438 	long m, mask = cl->prio_activity;
439 
440 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
441 		m = mask;
442 		mask = 0;
443 		while (m) {
444 			int prio = ffz(~m);
445 			m &= ~(1 << prio);
446 
447 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
448 				/* we are removing a child which is pointed to from
449 				 * the parent's feed - forget the pointer but remember
450 				 * classid
451 				 */
452 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
453 				p->un.inner.clprio[prio].ptr = NULL;
454 			}
455 
456 			htb_safe_rb_erase(cl->node + prio,
457 					  &p->un.inner.clprio[prio].feed);
458 
459 			if (!p->un.inner.clprio[prio].feed.rb_node)
460 				mask |= 1 << prio;
461 		}
462 
463 		p->prio_activity &= ~mask;
464 		cl = p;
465 		p = cl->parent;
466 
467 	}
468 	if (cl->cmode == HTB_CAN_SEND && mask)
469 		htb_remove_class_from_row(q, cl, mask);
470 }
471 
472 static inline s64 htb_lowater(const struct htb_class *cl)
473 {
474 	if (htb_hysteresis)
475 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
476 	else
477 		return 0;
478 }
479 static inline s64 htb_hiwater(const struct htb_class *cl)
480 {
481 	if (htb_hysteresis)
482 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
483 	else
484 		return 0;
485 }
486 
487 
488 /**
489  * htb_class_mode - computes and returns current class mode
490  *
491  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
492  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
493  * from now to time when cl will change its state.
494  * Also it is worth noting that the class mode doesn't change simply
495  * at cl->{c,}tokens == 0; rather, there can be hysteresis in the
496  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
497  * mode transitions per time unit. The speed gain is about 1/6.
498  */
499 static inline enum htb_cmode
500 htb_class_mode(struct htb_class *cl, s64 *diff)
501 {
502 	s64 toks;
503 
504 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
505 		*diff = -toks;
506 		return HTB_CANT_SEND;
507 	}
508 
509 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
510 		return HTB_CAN_SEND;
511 
512 	*diff = -toks;
513 	return HTB_MAY_BORROW;
514 }
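
/* Worked example for htb_class_mode(), with hysteresis disabled so that both
 * watermarks are 0 (the numbers are illustrative only):
 *
 *   ctokens + *diff = -5000 ns                          -> HTB_CANT_SEND and
 *       *diff becomes 5000 ns, the time until the ceil bucket is
 *       non-negative again;
 *   ctokens + *diff = 1000 ns, tokens + *diff =  200 ns -> HTB_CAN_SEND;
 *   ctokens + *diff = 1000 ns, tokens + *diff = -300 ns -> HTB_MAY_BORROW and
 *       *diff becomes 300 ns (wait time for the rate bucket).
 */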
515 
516 /**
517  * htb_change_class_mode - changes class's mode
518  *
519  * This should be the only way to change a class's mode under normal
520  * circumstances. The routine will update feed list linkage, change the mode
521  * and add class to the wait event queue if appropriate. New mode should
522  * be different from old one and cl->pq_key has to be valid if changing
523  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
524  */
525 static void
526 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
527 {
528 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
529 
530 	if (new_mode == cl->cmode)
531 		return;
532 
533 	if (cl->prio_activity) {	/* not necessary: speed optimization */
534 		if (cl->cmode != HTB_CANT_SEND)
535 			htb_deactivate_prios(q, cl);
536 		cl->cmode = new_mode;
537 		if (new_mode != HTB_CANT_SEND)
538 			htb_activate_prios(q, cl);
539 	} else
540 		cl->cmode = new_mode;
541 }
542 
543 /**
544  * htb_activate - inserts leaf cl into appropriate active feeds
545  *
546  * Routine learns (new) priority of leaf and activates feed chain
547  * for the prio. It can be called on an already active leaf safely.
548  * It also adds the leaf to the droplist.
549  */
550 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
551 {
552 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
553 
554 	if (!cl->prio_activity) {
555 		cl->prio_activity = 1 << cl->prio;
556 		htb_activate_prios(q, cl);
557 		list_add_tail(&cl->un.leaf.drop_list,
558 			      q->drops + cl->prio);
559 	}
560 }
561 
562 /**
563  * htb_deactivate - remove leaf cl from active feeds
564  *
565  * Make sure that the leaf is active. In other words it can't be called
566  * with a non-active leaf. It also removes the class from the drop list.
567  */
568 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
569 {
570 	WARN_ON(!cl->prio_activity);
571 
572 	htb_deactivate_prios(q, cl);
573 	cl->prio_activity = 0;
574 	list_del_init(&cl->un.leaf.drop_list);
575 }
576 
577 static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
578 			     struct qdisc_skb_head *qh)
579 {
580 	struct sk_buff *last = qh->tail;
581 
582 	if (last) {
583 		skb->next = NULL;
584 		last->next = skb;
585 		qh->tail = skb;
586 	} else {
587 		qh->tail = skb;
588 		qh->head = skb;
589 	}
590 	qh->qlen++;
591 }
592 
593 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
594 		       struct sk_buff **to_free)
595 {
596 	int uninitialized_var(ret);
597 	struct htb_sched *q = qdisc_priv(sch);
598 	struct htb_class *cl = htb_classify(skb, sch, &ret);
599 
600 	if (cl == HTB_DIRECT) {
601 		/* enqueue to helper queue */
602 		if (q->direct_queue.qlen < q->direct_qlen) {
603 			htb_enqueue_tail(skb, sch, &q->direct_queue);
604 			q->direct_pkts++;
605 		} else {
606 			return qdisc_drop(skb, sch, to_free);
607 		}
608 #ifdef CONFIG_NET_CLS_ACT
609 	} else if (!cl) {
610 		if (ret & __NET_XMIT_BYPASS)
611 			qdisc_qstats_drop(sch);
612 		__qdisc_drop(skb, to_free);
613 		return ret;
614 #endif
615 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
616 					to_free)) != NET_XMIT_SUCCESS) {
617 		if (net_xmit_drop_count(ret)) {
618 			qdisc_qstats_drop(sch);
619 			cl->drops++;
620 		}
621 		return ret;
622 	} else {
623 		htb_activate(q, cl);
624 	}
625 
626 	qdisc_qstats_backlog_inc(sch, skb);
627 	sch->q.qlen++;
628 	return NET_XMIT_SUCCESS;
629 }
630 
631 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
632 {
633 	s64 toks = diff + cl->tokens;
634 
635 	if (toks > cl->buffer)
636 		toks = cl->buffer;
637 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
638 	if (toks <= -cl->mbuffer)
639 		toks = 1 - cl->mbuffer;
640 
641 	cl->tokens = toks;
642 }
643 
644 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
645 {
646 	s64 toks = diff + cl->ctokens;
647 
648 	if (toks > cl->cbuffer)
649 		toks = cl->cbuffer;
650 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
651 	if (toks <= -cl->mbuffer)
652 		toks = 1 - cl->mbuffer;
653 
654 	cl->ctokens = toks;
655 }
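
/* Worked example for htb_accnt_tokens()/htb_accnt_ctokens() above
 * (illustrative figures): with cl->rate configured at 100 Mbit/s
 * (12,500,000 bytes/s), psched_l2t_ns() charges a 1500 byte packet about
 * 1500 / 12,500,000 s = 120,000 ns worth of tokens.  The balance is capped
 * at cl->buffer on the high side (a full bucket cannot accumulate extra
 * credit) and kept just above -cl->mbuffer on the low side.
 */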
656 
657 /**
658  * htb_charge_class - charges amount "bytes" to leaf and ancestors
659  *
660  * Routine assumes that packet "bytes" long was dequeued from leaf cl
661  * borrowing from "level". It accounts bytes to ceil leaky bucket for
662  * leaf and all ancestors and to rate bucket for ancestors at levels
663  * "level" and higher. It also handles possible change of mode resulting
664  * from the update. Note that mode can also increase here (MAY_BORROW to
665  * CAN_SEND) because we can use more precise clock that event queue here.
666  * CAN_SEND) because we can use a more precise clock than the event queue here.
667  * In such a case we remove the class from the event queue first.
668 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
669 			     int level, struct sk_buff *skb)
670 {
671 	int bytes = qdisc_pkt_len(skb);
672 	enum htb_cmode old_mode;
673 	s64 diff;
674 
675 	while (cl) {
676 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
677 		if (cl->level >= level) {
678 			if (cl->level == level)
679 				cl->xstats.lends++;
680 			htb_accnt_tokens(cl, bytes, diff);
681 		} else {
682 			cl->xstats.borrows++;
683 			cl->tokens += diff;	/* we moved t_c; update tokens */
684 		}
685 		htb_accnt_ctokens(cl, bytes, diff);
686 		cl->t_c = q->now;
687 
688 		old_mode = cl->cmode;
689 		diff = 0;
690 		htb_change_class_mode(q, cl, &diff);
691 		if (old_mode != cl->cmode) {
692 			if (old_mode != HTB_CAN_SEND)
693 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
694 			if (cl->cmode != HTB_CAN_SEND)
695 				htb_add_to_wait_tree(q, cl, diff);
696 		}
697 
698 		/* update basic stats except for leaves which are already updated */
699 		if (cl->level)
700 			bstats_update(&cl->bstats, skb);
701 
702 		cl = cl->parent;
703 	}
704 }
705 
706 /**
707  * htb_do_events - make mode changes to classes at the level
708  *
709  * Scans event queue for pending events and applies them. Returns time of
710  * next pending event (0 for no event in pq, q->now for too many events).
711  * Note: Only events with cl->pq_key <= q->now are applied.
712  */
713 static s64 htb_do_events(struct htb_sched *q, const int level,
714 			 unsigned long start)
715 {
716 	/* don't run for longer than 2 jiffies; 2 is used instead of
717 	 * 1 to simplify things when jiffy is going to be incremented
718 	 * too soon
719 	 */
720 	unsigned long stop_at = start + 2;
721 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
722 
723 	while (time_before(jiffies, stop_at)) {
724 		struct htb_class *cl;
725 		s64 diff;
726 		struct rb_node *p = rb_first(wait_pq);
727 
728 		if (!p)
729 			return 0;
730 
731 		cl = rb_entry(p, struct htb_class, pq_node);
732 		if (cl->pq_key > q->now)
733 			return cl->pq_key;
734 
735 		htb_safe_rb_erase(p, wait_pq);
736 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
737 		htb_change_class_mode(q, cl, &diff);
738 		if (cl->cmode != HTB_CAN_SEND)
739 			htb_add_to_wait_tree(q, cl, diff);
740 	}
741 
742 	/* too much load - let's continue after a break for scheduling */
743 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
744 		pr_warn("htb: too many events!\n");
745 		q->warned |= HTB_WARN_TOOMANYEVENTS;
746 	}
747 
748 	return q->now;
749 }
750 
751 /* Returns class->node+prio from the id-tree where the class's id is >= id.
752  * NULL if no such node exists.
753  */
754 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
755 					      u32 id)
756 {
757 	struct rb_node *r = NULL;
758 	while (n) {
759 		struct htb_class *cl =
760 		    rb_entry(n, struct htb_class, node[prio]);
761 
762 		if (id > cl->common.classid) {
763 			n = n->rb_right;
764 		} else if (id < cl->common.classid) {
765 			r = n;
766 			n = n->rb_left;
767 		} else {
768 			return n;
769 		}
770 	}
771 	return r;
772 }
773 
774 /**
775  * htb_lookup_leaf - returns next leaf class in DRR order
776  *
777  * Find the leaf the current feed pointer points to.
778  */
779 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
780 {
781 	int i;
782 	struct {
783 		struct rb_node *root;
784 		struct rb_node **pptr;
785 		u32 *pid;
786 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
787 
788 	BUG_ON(!hprio->row.rb_node);
789 	sp->root = hprio->row.rb_node;
790 	sp->pptr = &hprio->ptr;
791 	sp->pid = &hprio->last_ptr_id;
792 
793 	for (i = 0; i < 65535; i++) {
794 		if (!*sp->pptr && *sp->pid) {
795 			/* ptr was invalidated but id is valid - try to recover
796 			 * the original or next ptr
797 			 */
798 			*sp->pptr =
799 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
800 		}
801 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
802 				 * can become out of date quickly
803 				 */
804 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
805 			*sp->pptr = sp->root;
806 			while ((*sp->pptr)->rb_left)
807 				*sp->pptr = (*sp->pptr)->rb_left;
808 			if (sp > stk) {
809 				sp--;
810 				if (!*sp->pptr) {
811 					WARN_ON(1);
812 					return NULL;
813 				}
814 				htb_next_rb_node(sp->pptr);
815 			}
816 		} else {
817 			struct htb_class *cl;
818 			struct htb_prio *clp;
819 
820 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
821 			if (!cl->level)
822 				return cl;
823 			clp = &cl->un.inner.clprio[prio];
824 			(++sp)->root = clp->feed.rb_node;
825 			sp->pptr = &clp->ptr;
826 			sp->pid = &clp->last_ptr_id;
827 		}
828 	}
829 	WARN_ON(1);
830 	return NULL;
831 }
832 
833 /* dequeues packet at given priority and level; call only if
834  * you are sure that there is active class at prio/level
835  */
836 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
837 					const int level)
838 {
839 	struct sk_buff *skb = NULL;
840 	struct htb_class *cl, *start;
841 	struct htb_level *hlevel = &q->hlevel[level];
842 	struct htb_prio *hprio = &hlevel->hprio[prio];
843 
844 	/* look initial class up in the row */
845 	start = cl = htb_lookup_leaf(hprio, prio);
846 
847 	do {
848 next:
849 		if (unlikely(!cl))
850 			return NULL;
851 
852 		/* class can be empty - it is unlikely but can be true if leaf
853 		 * qdisc drops packets in enqueue routine or if someone used
854 		 * graft operation on the leaf since last dequeue;
855 		 * simply deactivate and skip such class
856 		 */
857 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
858 			struct htb_class *next;
859 			htb_deactivate(q, cl);
860 
861 			/* row/level might become empty */
862 			if ((q->row_mask[level] & (1 << prio)) == 0)
863 				return NULL;
864 
865 			next = htb_lookup_leaf(hprio, prio);
866 
867 			if (cl == start)	/* fix start if we just deleted it */
868 				start = next;
869 			cl = next;
870 			goto next;
871 		}
872 
873 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
874 		if (likely(skb != NULL))
875 			break;
876 
877 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
878 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
879 					 &q->hlevel[0].hprio[prio].ptr);
880 		cl = htb_lookup_leaf(hprio, prio);
881 
882 	} while (cl != start);
883 
884 	if (likely(skb != NULL)) {
885 		bstats_update(&cl->bstats, skb);
886 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
887 		if (cl->un.leaf.deficit[level] < 0) {
888 			cl->un.leaf.deficit[level] += cl->quantum;
889 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
890 						 &q->hlevel[0].hprio[prio].ptr);
891 		}
892 		/* this used to be after charge_class but this ordering
893 		 * gives us slightly better performance
894 		 */
895 		if (!cl->un.leaf.q->q.qlen)
896 			htb_deactivate(q, cl);
897 		htb_charge_class(q, cl, level, skb);
898 	}
899 	return skb;
900 }
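
/* Illustrative DRR example for the deficit handling in htb_dequeue_tree()
 * above: with cl->quantum = 1500 and deficit[level] starting at 0, dequeuing
 * a 1000 byte packet drives the deficit to -1000, so a quantum is added back
 * (deficit becomes 500) and the round-robin pointer advances to the next
 * class at this prio; the remaining 500 bytes carry over to this class's
 * next turn.
 */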
901 
902 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
903 {
904 	struct sk_buff *skb;
905 	struct htb_sched *q = qdisc_priv(sch);
906 	int level;
907 	s64 next_event;
908 	unsigned long start_at;
909 
910 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
911 	skb = __qdisc_dequeue_head(&q->direct_queue);
912 	if (skb != NULL) {
913 ok:
914 		qdisc_bstats_update(sch, skb);
915 		qdisc_qstats_backlog_dec(sch, skb);
916 		sch->q.qlen--;
917 		return skb;
918 	}
919 
920 	if (!sch->q.qlen)
921 		goto fin;
922 	q->now = ktime_get_ns();
923 	start_at = jiffies;
924 
925 	next_event = q->now + 5LLU * NSEC_PER_SEC;
926 
927 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
928 		/* common case optimization - skip event handler quickly */
929 		int m;
930 		s64 event = q->near_ev_cache[level];
931 
932 		if (q->now >= event) {
933 			event = htb_do_events(q, level, start_at);
934 			if (!event)
935 				event = q->now + NSEC_PER_SEC;
936 			q->near_ev_cache[level] = event;
937 		}
938 
939 		if (next_event > event)
940 			next_event = event;
941 
942 		m = ~q->row_mask[level];
943 		while (m != (int)(-1)) {
944 			int prio = ffz(m);
945 
946 			m |= 1 << prio;
947 			skb = htb_dequeue_tree(q, prio, level);
948 			if (likely(skb != NULL))
949 				goto ok;
950 		}
951 	}
952 	qdisc_qstats_overlimit(sch);
953 	if (likely(next_event > q->now))
954 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
955 	else
956 		schedule_work(&q->work);
957 fin:
958 	return skb;
959 }
960 
961 /* reset all classes */
962 /* always called under BH & queue lock */
963 static void htb_reset(struct Qdisc *sch)
964 {
965 	struct htb_sched *q = qdisc_priv(sch);
966 	struct htb_class *cl;
967 	unsigned int i;
968 
969 	for (i = 0; i < q->clhash.hashsize; i++) {
970 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
971 			if (cl->level)
972 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
973 			else {
974 				if (cl->un.leaf.q)
975 					qdisc_reset(cl->un.leaf.q);
976 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
977 			}
978 			cl->prio_activity = 0;
979 			cl->cmode = HTB_CAN_SEND;
980 		}
981 	}
982 	qdisc_watchdog_cancel(&q->watchdog);
983 	__qdisc_reset_queue(&q->direct_queue);
984 	sch->q.qlen = 0;
985 	sch->qstats.backlog = 0;
986 	memset(q->hlevel, 0, sizeof(q->hlevel));
987 	memset(q->row_mask, 0, sizeof(q->row_mask));
988 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
989 		INIT_LIST_HEAD(q->drops + i);
990 }
991 
992 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
993 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
994 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
995 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
996 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
997 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
998 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
999 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1000 };
1001 
1002 static void htb_work_func(struct work_struct *work)
1003 {
1004 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1005 	struct Qdisc *sch = q->watchdog.qdisc;
1006 
1007 	rcu_read_lock();
1008 	__netif_schedule(qdisc_root(sch));
1009 	rcu_read_unlock();
1010 }
1011 
1012 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1013 {
1014 	struct htb_sched *q = qdisc_priv(sch);
1015 	struct nlattr *tb[TCA_HTB_MAX + 1];
1016 	struct tc_htb_glob *gopt;
1017 	int err;
1018 	int i;
1019 
1020 	if (!opt)
1021 		return -EINVAL;
1022 
1023 	err = tcf_block_get(&q->block, &q->filter_list);
1024 	if (err)
1025 		return err;
1026 
1027 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1028 	if (err < 0)
1029 		return err;
1030 
1031 	if (!tb[TCA_HTB_INIT])
1032 		return -EINVAL;
1033 
1034 	gopt = nla_data(tb[TCA_HTB_INIT]);
1035 	if (gopt->version != HTB_VER >> 16)
1036 		return -EINVAL;
1037 
1038 	err = qdisc_class_hash_init(&q->clhash);
1039 	if (err < 0)
1040 		return err;
1041 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1042 		INIT_LIST_HEAD(q->drops + i);
1043 
1044 	qdisc_watchdog_init(&q->watchdog, sch);
1045 	INIT_WORK(&q->work, htb_work_func);
1046 	qdisc_skb_head_init(&q->direct_queue);
1047 
1048 	if (tb[TCA_HTB_DIRECT_QLEN])
1049 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1050 	else
1051 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1052 
1053 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1054 		q->rate2quantum = 1;
1055 	q->defcls = gopt->defcls;
1056 
1057 	return 0;
1058 }
1059 
1060 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1061 {
1062 	struct htb_sched *q = qdisc_priv(sch);
1063 	struct nlattr *nest;
1064 	struct tc_htb_glob gopt;
1065 
1066 	/* It's safe to not acquire the qdisc lock. As we hold RTNL,
1067 	 * no change can happen on the qdisc parameters.
1068 	 */
1069 
1070 	gopt.direct_pkts = q->direct_pkts;
1071 	gopt.version = HTB_VER;
1072 	gopt.rate2quantum = q->rate2quantum;
1073 	gopt.defcls = q->defcls;
1074 	gopt.debug = 0;
1075 
1076 	nest = nla_nest_start(skb, TCA_OPTIONS);
1077 	if (nest == NULL)
1078 		goto nla_put_failure;
1079 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1080 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1081 		goto nla_put_failure;
1082 
1083 	return nla_nest_end(skb, nest);
1084 
1085 nla_put_failure:
1086 	nla_nest_cancel(skb, nest);
1087 	return -1;
1088 }
1089 
1090 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1091 			  struct sk_buff *skb, struct tcmsg *tcm)
1092 {
1093 	struct htb_class *cl = (struct htb_class *)arg;
1094 	struct nlattr *nest;
1095 	struct tc_htb_opt opt;
1096 
1097 	/* It's safe to not acquire the qdisc lock. As we hold RTNL,
1098 	 * no change can happen on the class parameters.
1099 	 */
1100 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1101 	tcm->tcm_handle = cl->common.classid;
1102 	if (!cl->level && cl->un.leaf.q)
1103 		tcm->tcm_info = cl->un.leaf.q->handle;
1104 
1105 	nest = nla_nest_start(skb, TCA_OPTIONS);
1106 	if (nest == NULL)
1107 		goto nla_put_failure;
1108 
1109 	memset(&opt, 0, sizeof(opt));
1110 
1111 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1112 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1113 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1114 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1115 	opt.quantum = cl->quantum;
1116 	opt.prio = cl->prio;
1117 	opt.level = cl->level;
1118 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1119 		goto nla_put_failure;
1120 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1121 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1122 			      TCA_HTB_PAD))
1123 		goto nla_put_failure;
1124 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1125 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1126 			      TCA_HTB_PAD))
1127 		goto nla_put_failure;
1128 
1129 	return nla_nest_end(skb, nest);
1130 
1131 nla_put_failure:
1132 	nla_nest_cancel(skb, nest);
1133 	return -1;
1134 }
1135 
1136 static int
1137 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1138 {
1139 	struct htb_class *cl = (struct htb_class *)arg;
1140 	struct gnet_stats_queue qs = {
1141 		.drops = cl->drops,
1142 	};
1143 	__u32 qlen = 0;
1144 
1145 	if (!cl->level && cl->un.leaf.q) {
1146 		qlen = cl->un.leaf.q->q.qlen;
1147 		qs.backlog = cl->un.leaf.q->qstats.backlog;
1148 	}
1149 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1150 				    INT_MIN, INT_MAX);
1151 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1152 				     INT_MIN, INT_MAX);
1153 
1154 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1155 				  d, NULL, &cl->bstats) < 0 ||
1156 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1157 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1158 		return -1;
1159 
1160 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1161 }
1162 
1163 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1164 		     struct Qdisc **old)
1165 {
1166 	struct htb_class *cl = (struct htb_class *)arg;
1167 
1168 	if (cl->level)
1169 		return -EINVAL;
1170 	if (new == NULL &&
1171 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1172 				     cl->common.classid)) == NULL)
1173 		return -ENOBUFS;
1174 
1175 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
1176 	return 0;
1177 }
1178 
1179 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1180 {
1181 	struct htb_class *cl = (struct htb_class *)arg;
1182 	return !cl->level ? cl->un.leaf.q : NULL;
1183 }
1184 
1185 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1186 {
1187 	struct htb_class *cl = (struct htb_class *)arg;
1188 
1189 	if (cl->un.leaf.q->q.qlen == 0)
1190 		htb_deactivate(qdisc_priv(sch), cl);
1191 }
1192 
1193 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1194 {
1195 	struct htb_class *cl = htb_find(classid, sch);
1196 	if (cl)
1197 		cl->refcnt++;
1198 	return (unsigned long)cl;
1199 }
1200 
1201 static inline int htb_parent_last_child(struct htb_class *cl)
1202 {
1203 	if (!cl->parent)
1204 		/* the root class */
1205 		return 0;
1206 	if (cl->parent->children > 1)
1207 		/* not the last child */
1208 		return 0;
1209 	return 1;
1210 }
1211 
1212 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1213 			       struct Qdisc *new_q)
1214 {
1215 	struct htb_class *parent = cl->parent;
1216 
1217 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1218 
1219 	if (parent->cmode != HTB_CAN_SEND)
1220 		htb_safe_rb_erase(&parent->pq_node,
1221 				  &q->hlevel[parent->level].wait_pq);
1222 
1223 	parent->level = 0;
1224 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1225 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1226 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1227 	parent->tokens = parent->buffer;
1228 	parent->ctokens = parent->cbuffer;
1229 	parent->t_c = ktime_get_ns();
1230 	parent->cmode = HTB_CAN_SEND;
1231 }
1232 
1233 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1234 {
1235 	if (!cl->level) {
1236 		WARN_ON(!cl->un.leaf.q);
1237 		qdisc_destroy(cl->un.leaf.q);
1238 	}
1239 	gen_kill_estimator(&cl->rate_est);
1240 	tcf_block_put(cl->block);
1241 	kfree(cl);
1242 }
1243 
1244 static void htb_destroy(struct Qdisc *sch)
1245 {
1246 	struct htb_sched *q = qdisc_priv(sch);
1247 	struct hlist_node *next;
1248 	struct htb_class *cl;
1249 	unsigned int i;
1250 
1251 	cancel_work_sync(&q->work);
1252 	qdisc_watchdog_cancel(&q->watchdog);
1253 	/* This line used to be after htb_destroy_class call below
1254 	 * and surprisingly it worked in 2.4. But it must precede it
1255 	 * because filters need their target class alive to be able to call
1256 	 * unbind_filter on it (without Oops).
1257 	 */
1258 	tcf_block_put(q->block);
1259 
1260 	for (i = 0; i < q->clhash.hashsize; i++) {
1261 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
1262 			tcf_block_put(cl->block);
1263 	}
1264 	for (i = 0; i < q->clhash.hashsize; i++) {
1265 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1266 					  common.hnode)
1267 			htb_destroy_class(sch, cl);
1268 	}
1269 	qdisc_class_hash_destroy(&q->clhash);
1270 	__qdisc_reset_queue(&q->direct_queue);
1271 }
1272 
1273 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1274 {
1275 	struct htb_sched *q = qdisc_priv(sch);
1276 	struct htb_class *cl = (struct htb_class *)arg;
1277 	struct Qdisc *new_q = NULL;
1278 	int last_child = 0;
1279 
1280 	/* TODO: why don't we allow deleting a subtree? references? does
1281 	 * the tc subsystem guarantee us that in htb_destroy it holds no class
1282 	 * refs so that we can remove children safely there?
1283 	 */
1284 	if (cl->children || cl->filter_cnt)
1285 		return -EBUSY;
1286 
1287 	if (!cl->level && htb_parent_last_child(cl)) {
1288 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1289 					  cl->parent->common.classid);
1290 		last_child = 1;
1291 	}
1292 
1293 	sch_tree_lock(sch);
1294 
1295 	if (!cl->level) {
1296 		unsigned int qlen = cl->un.leaf.q->q.qlen;
1297 		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1298 
1299 		qdisc_reset(cl->un.leaf.q);
1300 		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1301 	}
1302 
1303 	/* delete from hash and active; remainder in destroy_class */
1304 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1305 	if (cl->parent)
1306 		cl->parent->children--;
1307 
1308 	if (cl->prio_activity)
1309 		htb_deactivate(q, cl);
1310 
1311 	if (cl->cmode != HTB_CAN_SEND)
1312 		htb_safe_rb_erase(&cl->pq_node,
1313 				  &q->hlevel[cl->level].wait_pq);
1314 
1315 	if (last_child)
1316 		htb_parent_to_leaf(q, cl, new_q);
1317 
1318 	BUG_ON(--cl->refcnt == 0);
1319 	/*
1320 	 * This shouldn't happen: we "hold" one cops->get() when called
1321 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1322 	 */
1323 
1324 	sch_tree_unlock(sch);
1325 	return 0;
1326 }
1327 
1328 static void htb_put(struct Qdisc *sch, unsigned long arg)
1329 {
1330 	struct htb_class *cl = (struct htb_class *)arg;
1331 
1332 	if (--cl->refcnt == 0)
1333 		htb_destroy_class(sch, cl);
1334 }
1335 
1336 static int htb_change_class(struct Qdisc *sch, u32 classid,
1337 			    u32 parentid, struct nlattr **tca,
1338 			    unsigned long *arg)
1339 {
1340 	int err = -EINVAL;
1341 	struct htb_sched *q = qdisc_priv(sch);
1342 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1343 	struct nlattr *opt = tca[TCA_OPTIONS];
1344 	struct nlattr *tb[TCA_HTB_MAX + 1];
1345 	struct tc_htb_opt *hopt;
1346 	u64 rate64, ceil64;
1347 
1348 	/* extract all subattrs from opt attr */
1349 	if (!opt)
1350 		goto failure;
1351 
1352 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1353 	if (err < 0)
1354 		goto failure;
1355 
1356 	err = -EINVAL;
1357 	if (tb[TCA_HTB_PARMS] == NULL)
1358 		goto failure;
1359 
1360 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1361 
1362 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1363 	if (!hopt->rate.rate || !hopt->ceil.rate)
1364 		goto failure;
1365 
1366 	/* Keep backward compatibility with rate_table based iproute2 tc */
1367 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1368 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1369 
1370 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1371 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1372 
1373 	if (!cl) {		/* new class */
1374 		struct Qdisc *new_q;
1375 		int prio;
1376 		struct {
1377 			struct nlattr		nla;
1378 			struct gnet_estimator	opt;
1379 		} est = {
1380 			.nla = {
1381 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1382 				.nla_type	= TCA_RATE,
1383 			},
1384 			.opt = {
1385 				/* 4s interval, 16s averaging constant */
1386 				.interval	= 2,
1387 				.ewma_log	= 2,
1388 			},
1389 		};
1390 
1391 		/* check for valid classid */
1392 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1393 		    htb_find(classid, sch))
1394 			goto failure;
1395 
1396 		/* check maximal depth */
1397 		if (parent && parent->parent && parent->parent->level < 2) {
1398 			pr_err("htb: tree is too deep\n");
1399 			goto failure;
1400 		}
1401 		err = -ENOBUFS;
1402 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1403 		if (!cl)
1404 			goto failure;
1405 
1406 		err = tcf_block_get(&cl->block, &cl->filter_list);
1407 		if (err) {
1408 			kfree(cl);
1409 			goto failure;
1410 		}
1411 		if (htb_rate_est || tca[TCA_RATE]) {
1412 			err = gen_new_estimator(&cl->bstats, NULL,
1413 						&cl->rate_est,
1414 						NULL,
1415 						qdisc_root_sleeping_running(sch),
1416 						tca[TCA_RATE] ? : &est.nla);
1417 			if (err) {
1418 				tcf_block_put(cl->block);
1419 				kfree(cl);
1420 				goto failure;
1421 			}
1422 		}
1423 
1424 		cl->refcnt = 1;
1425 		cl->children = 0;
1426 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1427 		RB_CLEAR_NODE(&cl->pq_node);
1428 
1429 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1430 			RB_CLEAR_NODE(&cl->node[prio]);
1431 
1432 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1433 		 * so it can't be used inside of sch_tree_lock
1434 		 * -- thanks to Karlis Peisenieks
1435 		 */
1436 		new_q = qdisc_create_dflt(sch->dev_queue,
1437 					  &pfifo_qdisc_ops, classid);
1438 		sch_tree_lock(sch);
1439 		if (parent && !parent->level) {
1440 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1441 			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1442 
1443 			/* turn parent into inner node */
1444 			qdisc_reset(parent->un.leaf.q);
1445 			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1446 			qdisc_destroy(parent->un.leaf.q);
1447 			if (parent->prio_activity)
1448 				htb_deactivate(q, parent);
1449 
1450 			/* remove from evt list because of level change */
1451 			if (parent->cmode != HTB_CAN_SEND) {
1452 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1453 				parent->cmode = HTB_CAN_SEND;
1454 			}
1455 			parent->level = (parent->parent ? parent->parent->level
1456 					 : TC_HTB_MAXDEPTH) - 1;
1457 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1458 		}
1459 		/* leaf (we) needs elementary qdisc */
1460 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1461 
1462 		cl->common.classid = classid;
1463 		cl->parent = parent;
1464 
1465 		/* set class to be in HTB_CAN_SEND state */
1466 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1467 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1468 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1469 		cl->t_c = ktime_get_ns();
1470 		cl->cmode = HTB_CAN_SEND;
1471 
1472 		/* attach to the hash list and parent's family */
1473 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1474 		if (parent)
1475 			parent->children++;
1476 		if (cl->un.leaf.q != &noop_qdisc)
1477 			qdisc_hash_add(cl->un.leaf.q, true);
1478 	} else {
1479 		if (tca[TCA_RATE]) {
1480 			err = gen_replace_estimator(&cl->bstats, NULL,
1481 						    &cl->rate_est,
1482 						    NULL,
1483 						    qdisc_root_sleeping_running(sch),
1484 						    tca[TCA_RATE]);
1485 			if (err)
1486 				return err;
1487 		}
1488 		sch_tree_lock(sch);
1489 	}
1490 
1491 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1492 
1493 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1494 
1495 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1496 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1497 
1498 	/* there used to be a nasty bug here, we have to check that the node
1499 	 * is really a leaf before changing cl->un.leaf!
1500 	 */
1501 	if (!cl->level) {
1502 		u64 quantum = cl->rate.rate_bytes_ps;
1503 
1504 		do_div(quantum, q->rate2quantum);
1505 		cl->quantum = min_t(u64, quantum, INT_MAX);
1506 
1507 		if (!hopt->quantum && cl->quantum < 1000) {
1508 			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1509 				cl->common.classid);
1510 			cl->quantum = 1000;
1511 		}
1512 		if (!hopt->quantum && cl->quantum > 200000) {
1513 			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1514 				cl->common.classid);
1515 			cl->quantum = 200000;
1516 		}
1517 		if (hopt->quantum)
1518 			cl->quantum = hopt->quantum;
1519 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1520 			cl->prio = TC_HTB_NUMPRIO - 1;
1521 	}
1522 
1523 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1524 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1525 
1526 	sch_tree_unlock(sch);
1527 
1528 	qdisc_class_hash_grow(sch, &q->clhash);
1529 
1530 	*arg = (unsigned long)cl;
1531 	return 0;
1532 
1533 failure:
1534 	return err;
1535 }
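
/* Worked example for the quantum heuristic in htb_change_class() above
 * (illustrative numbers, assuming iproute2's usual default of r2q = 10):
 * a leaf with rate 100 Mbit/s has rate_bytes_ps = 12,500,000, giving
 * quantum = 1,250,000, which is above the 200000 cap, so unless an explicit
 * quantum was supplied the value is clamped to 200000 and a warning is
 * printed.  A 10 kbit/s leaf (1,250 bytes/s) would instead be raised to the
 * 1000 byte floor.
 */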
1536 
1537 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
1538 {
1539 	struct htb_sched *q = qdisc_priv(sch);
1540 	struct htb_class *cl = (struct htb_class *)arg;
1541 
1542 	return cl ? cl->block : q->block;
1543 }
1544 
1545 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1546 				     u32 classid)
1547 {
1548 	struct htb_class *cl = htb_find(classid, sch);
1549 
1550 	/*if (cl && !cl->level) return 0;
1551 	 * The line above used to be there to prevent attaching filters to
1552 	 * leaves. But at least tc_index filter uses this just to get class
1553 	 * for other reasons, so we have to allow it.
1554 	 * ----
1555 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
1556 	 * another way to "lock" the class - unlike "get" this lock can
1557 	 * be broken by class during destroy IIUC.
1558 	 */
1559 	if (cl)
1560 		cl->filter_cnt++;
1561 	return (unsigned long)cl;
1562 }
1563 
1564 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1565 {
1566 	struct htb_class *cl = (struct htb_class *)arg;
1567 
1568 	if (cl)
1569 		cl->filter_cnt--;
1570 }
1571 
1572 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1573 {
1574 	struct htb_sched *q = qdisc_priv(sch);
1575 	struct htb_class *cl;
1576 	unsigned int i;
1577 
1578 	if (arg->stop)
1579 		return;
1580 
1581 	for (i = 0; i < q->clhash.hashsize; i++) {
1582 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1583 			if (arg->count < arg->skip) {
1584 				arg->count++;
1585 				continue;
1586 			}
1587 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1588 				arg->stop = 1;
1589 				return;
1590 			}
1591 			arg->count++;
1592 		}
1593 	}
1594 }
1595 
1596 static const struct Qdisc_class_ops htb_class_ops = {
1597 	.graft		=	htb_graft,
1598 	.leaf		=	htb_leaf,
1599 	.qlen_notify	=	htb_qlen_notify,
1600 	.get		=	htb_get,
1601 	.put		=	htb_put,
1602 	.change		=	htb_change_class,
1603 	.delete		=	htb_delete,
1604 	.walk		=	htb_walk,
1605 	.tcf_block	=	htb_tcf_block,
1606 	.bind_tcf	=	htb_bind_filter,
1607 	.unbind_tcf	=	htb_unbind_filter,
1608 	.dump		=	htb_dump_class,
1609 	.dump_stats	=	htb_dump_class_stats,
1610 };
1611 
1612 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1613 	.cl_ops		=	&htb_class_ops,
1614 	.id		=	"htb",
1615 	.priv_size	=	sizeof(struct htb_sched),
1616 	.enqueue	=	htb_enqueue,
1617 	.dequeue	=	htb_dequeue,
1618 	.peek		=	qdisc_peek_dequeued,
1619 	.init		=	htb_init,
1620 	.reset		=	htb_reset,
1621 	.destroy	=	htb_destroy,
1622 	.dump		=	htb_dump,
1623 	.owner		=	THIS_MODULE,
1624 };
1625 
1626 static int __init htb_module_init(void)
1627 {
1628 	return register_qdisc(&htb_qdisc_ops);
1629 }
1630 static void __exit htb_module_exit(void)
1631 {
1632 	unregister_qdisc(&htb_qdisc_ops);
1633 }
1634 
1635 module_init(htb_module_init)
1636 module_exit(htb_module_exit)
1637 MODULE_LICENSE("GPL");
1638