xref: /openbmc/linux/net/sched/sch_htb.c (revision 90a53e44)
1 /*
2  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Martin Devera, <devik@cdi.cz>
10  *
11  * Credits (in time order) for older HTB versions:
12  *              Stef Coene <stef.coene@docum.org>
13  *			HTB support at LARTC mailing list
14  *		Ondrej Kraus, <krauso@barr.cz>
15  *			found missing INIT_QDISC(htb)
16  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17  *			helped a lot to locate nasty class stall bug
18  *		Andi Kleen, Jamal Hadi, Bert Hubert
19  *			code review and helpful comments on shaping
20  *		Tomasz Wrona, <tw@eter.tym.pl>
21  *			created test case so that I was able to fix nasty bug
22  *		Wilfried Weissmann
23  *			spotted bug in dequeue code and helped with fix
24  *		Jiri Fojtasek
25  *			fixed requeue routine
26  *		and many others. thanks.
27  */
28 #include <linux/module.h>
29 #include <linux/moduleparam.h>
30 #include <linux/types.h>
31 #include <linux/kernel.h>
32 #include <linux/string.h>
33 #include <linux/errno.h>
34 #include <linux/skbuff.h>
35 #include <linux/list.h>
36 #include <linux/compiler.h>
37 #include <linux/rbtree.h>
38 #include <linux/workqueue.h>
39 #include <linux/slab.h>
40 #include <net/netlink.h>
41 #include <net/sch_generic.h>
42 #include <net/pkt_sched.h>
43 #include <net/pkt_cls.h>
44 
45 /* HTB algorithm.
46     Author: devik@cdi.cz
47     ========================================================================
48     HTB is like TBF with multiple classes. It is also similar to CBQ because
49     it allows assigning a priority to each class in the hierarchy.
50     In fact it is another implementation of Floyd's formal sharing.
51 
52     Levels:
53     Each class is assigned a level. A leaf ALWAYS has level 0 and root
54     classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
55     one less than their parent's.
56 */
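/* A hedged illustration of the above (device name, rates and classids are
 * arbitrary examples, not taken from this file): a hierarchy created with
 * iproute2 along the lines of
 *
 *	tc qdisc add dev eth0 root handle 1: htb default 20
 *	tc class add dev eth0 parent 1:  classid 1:1  htb rate 100mbit
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 30mbit ceil 100mbit
 *	tc class add dev eth0 parent 1:1 classid 1:20 htb rate 70mbit ceil 100mbit
 *
 * ends up with the leaves 1:10 and 1:20 at level 0 and the root class 1:1,
 * once it becomes an inner node, at level TC_HTB_MAXDEPTH - 1; a class
 * squeezed between 1:1 and the leaves would sit one level below 1:1 (see
 * how htb_change_class() assigns parent->level).
 */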
57 
58 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
59 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
60 
61 #if HTB_VER >> 16 != TC_HTB_PROTOVER
62 #error "Mismatched sch_htb.c and pkt_sch.h"
63 #endif
64 
65 /* Module parameter and sysfs export */
66 module_param    (htb_hysteresis, int, 0640);
67 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
68 
69 static int htb_rate_est = 0; /* whether to set up a default rate estimator for htb classes */
70 module_param(htb_rate_est, int, 0640);
71 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
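/* Both knobs are also exposed as writable module parameters, so they can be
 * flipped at run time through sysfs (a sketch; assumes the module is built
 * as sch_htb and sysfs is mounted in the usual place):
 *
 *	echo 1 > /sys/module/sch_htb/parameters/htb_hysteresis
 *	echo 1 > /sys/module/sch_htb/parameters/htb_rate_est
 *
 * Note that htb_rate_est is only consulted when a class is created (see
 * htb_change_class()), so changing it does not affect existing classes.
 */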
72 
73 /* used internally to keep the status of a single class */
74 enum htb_cmode {
75 	HTB_CANT_SEND,		/* class can't send and can't borrow */
76 	HTB_MAY_BORROW,		/* class can't send but may borrow */
77 	HTB_CAN_SEND		/* class can send */
78 };
79 
80 struct htb_prio {
81 	union {
82 		struct rb_root	row;
83 		struct rb_root	feed;
84 	};
85 	struct rb_node	*ptr;
86 	/* When a class changes from state 1->2 and disconnects from its
87 	 * parent's feed, we lose the ptr value and would start from the
88 	 * first child again. Here we store the classid of the
89 	 * last valid ptr (used when ptr is NULL).
90 	 */
91 	u32		last_ptr_id;
92 };
93 
94 /* interior & leaf nodes; props specific to leaves are marked L:
95  * To reduce false sharing, place mostly read fields at beginning,
96  * and mostly written ones at the end.
97  */
98 struct htb_class {
99 	struct Qdisc_class_common common;
100 	struct psched_ratecfg	rate;
101 	struct psched_ratecfg	ceil;
102 	s64			buffer, cbuffer;/* token bucket depths for rate/ceil */
103 	s64			mbuffer;	/* max wait time */
104 	u32			prio;		/* these two are used only by leaves... */
105 	int			quantum;	/* but stored for parent-to-leaf return */
106 
107 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
108 	struct tcf_block	*block;
109 	int			filter_cnt;
110 
111 	int			level;		/* our level (see above) */
112 	unsigned int		children;
113 	struct htb_class	*parent;	/* parent class */
114 
115 	struct net_rate_estimator __rcu *rate_est;
116 
117 	/*
118 	 * Frequently written fields
119 	 */
120 	struct gnet_stats_basic_packed bstats;
121 	struct tc_htb_xstats	xstats;	/* our special stats */
122 
123 	/* token bucket parameters */
124 	s64			tokens, ctokens;/* current number of tokens */
125 	s64			t_c;		/* checkpoint time */
126 
127 	union {
128 		struct htb_class_leaf {
129 			struct list_head drop_list;
130 			int		deficit[TC_HTB_MAXDEPTH];
131 			struct Qdisc	*q;
132 		} leaf;
133 		struct htb_class_inner {
134 			struct htb_prio clprio[TC_HTB_NUMPRIO];
135 		} inner;
136 	} un;
137 	s64			pq_key;
138 
139 	int			prio_activity;	/* for which prios are we active */
140 	enum htb_cmode		cmode;		/* current mode of the class */
141 	struct rb_node		pq_node;	/* node for event queue */
142 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
143 
144 	unsigned int drops ____cacheline_aligned_in_smp;
145 	unsigned int		overlimits;
146 };
147 
148 struct htb_level {
149 	struct rb_root	wait_pq;
150 	struct htb_prio hprio[TC_HTB_NUMPRIO];
151 };
152 
153 struct htb_sched {
154 	struct Qdisc_class_hash clhash;
155 	int			defcls;		/* class where unclassified flows go */
156 	int			rate2quantum;	/* quant = rate / rate2quantum */
157 
158 	/* filters for qdisc itself */
159 	struct tcf_proto __rcu	*filter_list;
160 	struct tcf_block	*block;
161 
162 #define HTB_WARN_TOOMANYEVENTS	0x1
163 	unsigned int		warned;	/* only one warning */
164 	int			direct_qlen;
165 	struct work_struct	work;
166 
167 	/* non-shaped skbs; let them go directly through */
168 	struct qdisc_skb_head	direct_queue;
169 	long			direct_pkts;
170 
171 	struct qdisc_watchdog	watchdog;
172 
173 	s64			now;	/* cached dequeue time */
174 	struct list_head	drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
175 
176 	/* time of nearest event per level (row) */
177 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
178 
179 	int			row_mask[TC_HTB_MAXDEPTH];
180 
181 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
182 };
183 
184 /* find class in global hash table using given handle */
185 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
186 {
187 	struct htb_sched *q = qdisc_priv(sch);
188 	struct Qdisc_class_common *clc;
189 
190 	clc = qdisc_class_find(&q->clhash, handle);
191 	if (clc == NULL)
192 		return NULL;
193 	return container_of(clc, struct htb_class, common);
194 }
195 
196 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
197 {
198 	return (unsigned long)htb_find(handle, sch);
199 }
200 /**
201  * htb_classify - classify a packet into class
202  *
203  * It returns NULL if the packet should be dropped, or HTB_DIRECT (-1) if it
204  * should be passed directly through. In all other cases a leaf class is
205  * returned. We allow direct class selection by classid in skb->priority.
206  * Then we examine filters in the qdisc and in inner nodes (if a higher
207  * filter points to an inner node). If we end up with classid MAJOR:0 we
208  * enqueue the skb into the special internal fifo (direct); such packets
209  * go directly through. If we still have no valid leaf we try to use the
210  * MAJOR:default leaf. If that is still unsuccessful, we return the direct queue.
211  */
212 #define HTB_DIRECT ((struct htb_class *)-1L)
213 
214 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
215 				      int *qerr)
216 {
217 	struct htb_sched *q = qdisc_priv(sch);
218 	struct htb_class *cl;
219 	struct tcf_result res;
220 	struct tcf_proto *tcf;
221 	int result;
222 
223 	/* allow class selection by setting skb->priority to a valid classid;
224 	 * note that nfmark can be used too by attaching filter fw with no
225 	 * rules in it
226 	 */
227 	if (skb->priority == sch->handle)
228 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
229 	cl = htb_find(skb->priority, sch);
230 	if (cl) {
231 		if (cl->level == 0)
232 			return cl;
233 		/* Start with inner filter chain if a non-leaf class is selected */
234 		tcf = rcu_dereference_bh(cl->filter_list);
235 	} else {
236 		tcf = rcu_dereference_bh(q->filter_list);
237 	}
238 
239 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
240 	while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) {
241 #ifdef CONFIG_NET_CLS_ACT
242 		switch (result) {
243 		case TC_ACT_QUEUED:
244 		case TC_ACT_STOLEN:
245 		case TC_ACT_TRAP:
246 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
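			/* fall through */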
247 		case TC_ACT_SHOT:
248 			return NULL;
249 		}
250 #endif
251 		cl = (void *)res.class;
252 		if (!cl) {
253 			if (res.classid == sch->handle)
254 				return HTB_DIRECT;	/* X:0 (direct flow) */
255 			cl = htb_find(res.classid, sch);
256 			if (!cl)
257 				break;	/* filter selected invalid classid */
258 		}
259 		if (!cl->level)
260 			return cl;	/* we hit leaf; return it */
261 
262 		/* we have got inner class; apply inner filter chain */
263 		tcf = rcu_dereference_bh(cl->filter_list);
264 	}
265 	/* classification failed; try to use default class */
266 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
267 	if (!cl || cl->level)
268 		return HTB_DIRECT;	/* bad default .. this is safe bet */
269 	return cl;
270 }
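/* A hedged usage sketch of the classification order above (the filter below
 * is an arbitrary example, not part of this file): with
 *
 *	tc filter add dev eth0 parent 1: protocol ip u32 \
 *		match ip dport 80 0xffff flowid 1:10
 *
 * packets to port 80 are steered to leaf 1:10 by the qdisc-level filter,
 * an application that sets SO_PRIORITY to 0x10020 (i.e. classid 1:20)
 * short-circuits the filters entirely, and anything left over lands in
 * the MAJOR:default class or, failing that, in the direct queue.
 */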
271 
272 /**
273  * htb_add_to_id_tree - adds class to the round robin list
274  *
275  * The routine adds the class to a list (actually a tree) sorted by classid.
276  * Make sure that the class is not already on such a list for the given prio.
277  */
278 static void htb_add_to_id_tree(struct rb_root *root,
279 			       struct htb_class *cl, int prio)
280 {
281 	struct rb_node **p = &root->rb_node, *parent = NULL;
282 
283 	while (*p) {
284 		struct htb_class *c;
285 		parent = *p;
286 		c = rb_entry(parent, struct htb_class, node[prio]);
287 
288 		if (cl->common.classid > c->common.classid)
289 			p = &parent->rb_right;
290 		else
291 			p = &parent->rb_left;
292 	}
293 	rb_link_node(&cl->node[prio], parent, p);
294 	rb_insert_color(&cl->node[prio], root);
295 }
296 
297 /**
298  * htb_add_to_wait_tree - adds class to the event queue with delay
299  *
300  * The class is added to the priority event queue to indicate that the class
301  * will change its mode at time cl->pq_key (an absolute time in nanoseconds).
302  * Make sure that the class is not already in the queue.
303  */
304 static void htb_add_to_wait_tree(struct htb_sched *q,
305 				 struct htb_class *cl, s64 delay)
306 {
307 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
308 
309 	cl->pq_key = q->now + delay;
310 	if (cl->pq_key == q->now)
311 		cl->pq_key++;
312 
313 	/* update the nearest event cache */
314 	if (q->near_ev_cache[cl->level] > cl->pq_key)
315 		q->near_ev_cache[cl->level] = cl->pq_key;
316 
317 	while (*p) {
318 		struct htb_class *c;
319 		parent = *p;
320 		c = rb_entry(parent, struct htb_class, pq_node);
321 		if (cl->pq_key >= c->pq_key)
322 			p = &parent->rb_right;
323 		else
324 			p = &parent->rb_left;
325 	}
326 	rb_link_node(&cl->pq_node, parent, p);
327 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
328 }
329 
330 /**
331  * htb_next_rb_node - finds next node in binary tree
332  *
333  * When we are past the last key, *n is set to NULL.
334  * Average complexity is 2 steps per call.
335  */
336 static inline void htb_next_rb_node(struct rb_node **n)
337 {
338 	*n = rb_next(*n);
339 }
340 
341 /**
342  * htb_add_class_to_row - add class to its row
343  *
344  * The class is added to row at priorities marked in mask.
345  * It does nothing if mask == 0.
346  */
347 static inline void htb_add_class_to_row(struct htb_sched *q,
348 					struct htb_class *cl, int mask)
349 {
350 	q->row_mask[cl->level] |= mask;
351 	while (mask) {
352 		int prio = ffz(~mask);
353 		mask &= ~(1 << prio);
354 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
355 	}
356 }
357 
358 /* If this triggers, it is a bug in this code, but it need not be fatal */
359 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
360 {
361 	if (RB_EMPTY_NODE(rb)) {
362 		WARN_ON(1);
363 	} else {
364 		rb_erase(rb, root);
365 		RB_CLEAR_NODE(rb);
366 	}
367 }
368 
369 
370 /**
371  * htb_remove_class_from_row - removes class from its row
372  *
373  * The class is removed from row at priorities marked in mask.
374  * It does nothing if mask == 0.
375  */
376 static inline void htb_remove_class_from_row(struct htb_sched *q,
377 						 struct htb_class *cl, int mask)
378 {
379 	int m = 0;
380 	struct htb_level *hlevel = &q->hlevel[cl->level];
381 
382 	while (mask) {
383 		int prio = ffz(~mask);
384 		struct htb_prio *hprio = &hlevel->hprio[prio];
385 
386 		mask &= ~(1 << prio);
387 		if (hprio->ptr == cl->node + prio)
388 			htb_next_rb_node(&hprio->ptr);
389 
390 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
391 		if (!hprio->row.rb_node)
392 			m |= 1 << prio;
393 	}
394 	q->row_mask[cl->level] &= ~m;
395 }
396 
397 /**
398  * htb_activate_prios - creates an active class's feed chain
399  *
400  * The class is connected to its ancestors and/or the appropriate rows
401  * for the priorities it is participating in. cl->cmode must be the new
402  * (activated) mode. It does nothing if cl->prio_activity == 0.
403  */
404 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
405 {
406 	struct htb_class *p = cl->parent;
407 	long m, mask = cl->prio_activity;
408 
409 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
410 		m = mask;
411 		while (m) {
412 			int prio = ffz(~m);
413 			m &= ~(1 << prio);
414 
415 			if (p->un.inner.clprio[prio].feed.rb_node)
416 				/* parent already has its feed in use, so reset
417 				 * the bit in mask as the parent is already ok
418 				 */
419 				mask &= ~(1 << prio);
420 
421 			htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
422 		}
423 		p->prio_activity |= mask;
424 		cl = p;
425 		p = cl->parent;
426 
427 	}
428 	if (cl->cmode == HTB_CAN_SEND && mask)
429 		htb_add_class_to_row(q, cl, mask);
430 }
431 
432 /**
433  * htb_deactivate_prios - remove class from feed chain
434  *
435  * cl->cmode must represent the old mode (before deactivation). It does
436  * nothing if cl->prio_activity == 0. The class is removed from all feed
437  * chains and rows.
438  */
439 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
440 {
441 	struct htb_class *p = cl->parent;
442 	long m, mask = cl->prio_activity;
443 
444 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
445 		m = mask;
446 		mask = 0;
447 		while (m) {
448 			int prio = ffz(~m);
449 			m &= ~(1 << prio);
450 
451 			if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
452 				/* we are removing a child that is pointed to from
453 				 * the parent feed - forget the pointer but remember
454 				 * the classid
455 				 */
456 				p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
457 				p->un.inner.clprio[prio].ptr = NULL;
458 			}
459 
460 			htb_safe_rb_erase(cl->node + prio,
461 					  &p->un.inner.clprio[prio].feed);
462 
463 			if (!p->un.inner.clprio[prio].feed.rb_node)
464 				mask |= 1 << prio;
465 		}
466 
467 		p->prio_activity &= ~mask;
468 		cl = p;
469 		p = cl->parent;
470 
471 	}
472 	if (cl->cmode == HTB_CAN_SEND && mask)
473 		htb_remove_class_from_row(q, cl, mask);
474 }
475 
476 static inline s64 htb_lowater(const struct htb_class *cl)
477 {
478 	if (htb_hysteresis)
479 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
480 	else
481 		return 0;
482 }
483 static inline s64 htb_hiwater(const struct htb_class *cl)
484 {
485 	if (htb_hysteresis)
486 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
487 	else
488 		return 0;
489 }
490 
491 
492 /**
493  * htb_class_mode - computes and returns current class mode
494  *
495  * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
496  * is not HTB_CAN_SEND then *diff is set to the time remaining until cl
497  * will change its state.
498  * Also it is worth noting that the class mode doesn't change simply at
499  * cl->{c,}tokens == 0; rather, there can be hysteresis over the
500  * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
501  * mode transitions per time unit. The speed gain is about 1/6.
502  */
503 static inline enum htb_cmode
504 htb_class_mode(struct htb_class *cl, s64 *diff)
505 {
506 	s64 toks;
507 
508 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
509 		*diff = -toks;
510 		return HTB_CANT_SEND;
511 	}
512 
513 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
514 		return HTB_CAN_SEND;
515 
516 	*diff = -toks;
517 	return HTB_MAY_BORROW;
518 }
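/* A worked example of the hysteresis (illustrative numbers only): assume
 * htb_hysteresis = 1 and a cbuffer worth roughly 1.2 ms of tokens (15 kbyte
 * at 100 Mbit/s). While the class is not in HTB_CANT_SEND, htb_lowater() is
 * -cl->cbuffer, so it is throttled only once ctokens falls below -cbuffer
 * rather than below 0; once throttled, htb_lowater() becomes 0, so it stays
 * throttled until ctokens climbs back above 0. htb_hiwater() applies the
 * same idea to the rate bucket for the CAN_SEND/MAY_BORROW boundary. With
 * htb_hysteresis = 0 both thresholds are 0 and the mode follows the token
 * counts exactly.
 */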
519 
520 /**
521  * htb_change_class_mode - changes a class's mode
522  *
523  * This should be the only way to change a class's mode under normal
524  * circumstances. The routine will update the feed list linkage, change
525  * the mode and add the class to the wait event queue if appropriate.
526  * The new mode should be different from the old one and cl->pq_key has to
527  * be valid if changing to a mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
528  */
529 static void
530 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
531 {
532 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
533 
534 	if (new_mode == cl->cmode)
535 		return;
536 
537 	if (new_mode == HTB_CANT_SEND)
538 		cl->overlimits++;
539 
540 	if (cl->prio_activity) {	/* not necessary: speed optimization */
541 		if (cl->cmode != HTB_CANT_SEND)
542 			htb_deactivate_prios(q, cl);
543 		cl->cmode = new_mode;
544 		if (new_mode != HTB_CANT_SEND)
545 			htb_activate_prios(q, cl);
546 	} else
547 		cl->cmode = new_mode;
548 }
549 
550 /**
551  * htb_activate - inserts leaf cl into appropriate active feeds
552  *
553  * The routine learns the (new) priority of the leaf and activates the feed
554  * chain for that prio. It can safely be called on an already active leaf.
555  * It also adds the leaf to the drop list.
556  */
557 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
558 {
559 	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
560 
561 	if (!cl->prio_activity) {
562 		cl->prio_activity = 1 << cl->prio;
563 		htb_activate_prios(q, cl);
564 		list_add_tail(&cl->un.leaf.drop_list,
565 			      q->drops + cl->prio);
566 	}
567 }
568 
569 /**
570  * htb_deactivate - remove leaf cl from active feeds
571  *
572  * Make sure that the leaf is active; in other words, it can't be called
573  * with a non-active leaf. It also removes the class from the drop list.
574  */
575 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
576 {
577 	WARN_ON(!cl->prio_activity);
578 
579 	htb_deactivate_prios(q, cl);
580 	cl->prio_activity = 0;
581 	list_del_init(&cl->un.leaf.drop_list);
582 }
583 
584 static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
585 			     struct qdisc_skb_head *qh)
586 {
587 	struct sk_buff *last = qh->tail;
588 
589 	if (last) {
590 		skb->next = NULL;
591 		last->next = skb;
592 		qh->tail = skb;
593 	} else {
594 		qh->tail = skb;
595 		qh->head = skb;
596 	}
597 	qh->qlen++;
598 }
599 
600 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
601 		       struct sk_buff **to_free)
602 {
603 	int uninitialized_var(ret);
604 	struct htb_sched *q = qdisc_priv(sch);
605 	struct htb_class *cl = htb_classify(skb, sch, &ret);
606 
607 	if (cl == HTB_DIRECT) {
608 		/* enqueue to helper queue */
609 		if (q->direct_queue.qlen < q->direct_qlen) {
610 			htb_enqueue_tail(skb, sch, &q->direct_queue);
611 			q->direct_pkts++;
612 		} else {
613 			return qdisc_drop(skb, sch, to_free);
614 		}
615 #ifdef CONFIG_NET_CLS_ACT
616 	} else if (!cl) {
617 		if (ret & __NET_XMIT_BYPASS)
618 			qdisc_qstats_drop(sch);
619 		__qdisc_drop(skb, to_free);
620 		return ret;
621 #endif
622 	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
623 					to_free)) != NET_XMIT_SUCCESS) {
624 		if (net_xmit_drop_count(ret)) {
625 			qdisc_qstats_drop(sch);
626 			cl->drops++;
627 		}
628 		return ret;
629 	} else {
630 		htb_activate(q, cl);
631 	}
632 
633 	qdisc_qstats_backlog_inc(sch, skb);
634 	sch->q.qlen++;
635 	return NET_XMIT_SUCCESS;
636 }
637 
638 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
639 {
640 	s64 toks = diff + cl->tokens;
641 
642 	if (toks > cl->buffer)
643 		toks = cl->buffer;
644 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
645 	if (toks <= -cl->mbuffer)
646 		toks = 1 - cl->mbuffer;
647 
648 	cl->tokens = toks;
649 }
650 
651 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
652 {
653 	s64 toks = diff + cl->ctokens;
654 
655 	if (toks > cl->cbuffer)
656 		toks = cl->cbuffer;
657 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
658 	if (toks <= -cl->mbuffer)
659 		toks = 1 - cl->mbuffer;
660 
661 	cl->ctokens = toks;
662 }
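/* Token accounting in numbers (an illustrative sketch): for a 1500 byte
 * packet on a 10 Mbit/s rate, psched_l2t_ns() yields the wire time
 * 1500 * 8 / 10^7 s = 1.2 ms = 1,200,000 ns, which is subtracted from the
 * diff-replenished token count. The count is clamped to cl->buffer at the
 * top and to just above -cl->mbuffer (60 s by default, see
 * htb_change_class()) at the bottom, so neither a long idle period nor a
 * long overload can build up unbounded credit or debt.
 */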
663 
664 /**
665  * htb_charge_class - charges amount "bytes" to leaf and ancestors
666  *
667  * The routine assumes that a packet "bytes" long was dequeued from leaf cl
668  * borrowing from "level". It accounts the bytes to the ceil leaky bucket
669  * for the leaf and all ancestors and to the rate bucket for ancestors at
670  * levels "level" and higher. It also handles a possible change of mode
671  * resulting from the update. Note that the mode can also increase here
672  * (MAY_BORROW to CAN_SEND) because we can use a more precise clock than
673  * the event queue here. In such a case we remove the class from the event queue first.
674  */
675 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
676 			     int level, struct sk_buff *skb)
677 {
678 	int bytes = qdisc_pkt_len(skb);
679 	enum htb_cmode old_mode;
680 	s64 diff;
681 
682 	while (cl) {
683 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
684 		if (cl->level >= level) {
685 			if (cl->level == level)
686 				cl->xstats.lends++;
687 			htb_accnt_tokens(cl, bytes, diff);
688 		} else {
689 			cl->xstats.borrows++;
690 			cl->tokens += diff;	/* we moved t_c; update tokens */
691 		}
692 		htb_accnt_ctokens(cl, bytes, diff);
693 		cl->t_c = q->now;
694 
695 		old_mode = cl->cmode;
696 		diff = 0;
697 		htb_change_class_mode(q, cl, &diff);
698 		if (old_mode != cl->cmode) {
699 			if (old_mode != HTB_CAN_SEND)
700 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
701 			if (cl->cmode != HTB_CAN_SEND)
702 				htb_add_to_wait_tree(q, cl, diff);
703 		}
704 
705 		/* update basic stats except for leaves which are already updated */
706 		if (cl->level)
707 			bstats_update(&cl->bstats, skb);
708 
709 		cl = cl->parent;
710 	}
711 }
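/* A hedged example of the lend/borrow bookkeeping above: suppose a packet
 * leaves a leaf that has exhausted its own rate tokens and the grant came
 * from its parent, i.e. "level" equals the parent's level. The leaf
 * (cl->level < level) merely refreshes its tokens for the elapsed time and
 * counts a borrow, the parent (cl->level == level) pays for the packet from
 * its rate bucket and counts a lend, ancestors above it are charged to
 * their rate buckets too, and every node on the path, leaf included, is
 * charged against its ceil bucket. The counters surface as the
 * borrows/lends fields of tc_htb_xstats (shown by "tc -s class show").
 */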
712 
713 /**
714  * htb_do_events - make mode changes to classes at the level
715  *
716  * Scans the event queue for pending events and applies them. Returns the time
717  * of the next pending event (0 for no event in pq, q->now for too many events).
718  * Note: Applied are events that have cl->pq_key <= q->now.
719  */
720 static s64 htb_do_events(struct htb_sched *q, const int level,
721 			 unsigned long start)
722 {
723 	/* don't run for longer than 2 jiffies; 2 is used instead of
724 	 * 1 to simplify things when jiffy is going to be incremented
725 	 * 1 to simplify things when jiffies is about to be incremented
726 	 */
727 	unsigned long stop_at = start + 2;
728 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
729 
730 	while (time_before(jiffies, stop_at)) {
731 		struct htb_class *cl;
732 		s64 diff;
733 		struct rb_node *p = rb_first(wait_pq);
734 
735 		if (!p)
736 			return 0;
737 
738 		cl = rb_entry(p, struct htb_class, pq_node);
739 		if (cl->pq_key > q->now)
740 			return cl->pq_key;
741 
742 		htb_safe_rb_erase(p, wait_pq);
743 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
744 		htb_change_class_mode(q, cl, &diff);
745 		if (cl->cmode != HTB_CAN_SEND)
746 			htb_add_to_wait_tree(q, cl, diff);
747 	}
748 
749 	/* too much load - let's continue after a break for scheduling */
750 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
751 		pr_warn("htb: too many events!\n");
752 		q->warned |= HTB_WARN_TOOMANYEVENTS;
753 	}
754 
755 	return q->now;
756 }
757 
758 /* Returns class->node+prio from the id-tree where the class's id is >= id.
759  * NULL if no such one exists.
760  */
761 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
762 					      u32 id)
763 {
764 	struct rb_node *r = NULL;
765 	while (n) {
766 		struct htb_class *cl =
767 		    rb_entry(n, struct htb_class, node[prio]);
768 
769 		if (id > cl->common.classid) {
770 			n = n->rb_right;
771 		} else if (id < cl->common.classid) {
772 			r = n;
773 			n = n->rb_left;
774 		} else {
775 			return n;
776 		}
777 	}
778 	return r;
779 }
780 
781 /**
782  * htb_lookup_leaf - returns next leaf class in DRR order
783  *
784  * Find the leaf that the current feed pointer points to.
785  */
786 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
787 {
788 	int i;
789 	struct {
790 		struct rb_node *root;
791 		struct rb_node **pptr;
792 		u32 *pid;
793 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
794 
795 	BUG_ON(!hprio->row.rb_node);
796 	sp->root = hprio->row.rb_node;
797 	sp->pptr = &hprio->ptr;
798 	sp->pid = &hprio->last_ptr_id;
799 
800 	for (i = 0; i < 65535; i++) {
801 		if (!*sp->pptr && *sp->pid) {
802 			/* ptr was invalidated but id is valid - try to recover
803 			 * the original or next ptr
804 			 */
805 			*sp->pptr =
806 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
807 		}
808 		*sp->pid = 0;	/* ptr is valid now, so remove this hint as it
809 				 * can become out of date quickly
810 				 */
811 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
812 			*sp->pptr = sp->root;
813 			while ((*sp->pptr)->rb_left)
814 				*sp->pptr = (*sp->pptr)->rb_left;
815 			if (sp > stk) {
816 				sp--;
817 				if (!*sp->pptr) {
818 					WARN_ON(1);
819 					return NULL;
820 				}
821 				htb_next_rb_node(sp->pptr);
822 			}
823 		} else {
824 			struct htb_class *cl;
825 			struct htb_prio *clp;
826 
827 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
828 			if (!cl->level)
829 				return cl;
830 			clp = &cl->un.inner.clprio[prio];
831 			(++sp)->root = clp->feed.rb_node;
832 			sp->pptr = &clp->ptr;
833 			sp->pid = &clp->last_ptr_id;
834 		}
835 	}
836 	WARN_ON(1);
837 	return NULL;
838 }
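/* The walk above keeps an explicit stack of at most TC_HTB_MAXDEPTH levels
 * (row, then nested feeds, down to a leaf) and resumes from hprio->ptr, or
 * from last_ptr_id when the pointer was invalidated by a class going
 * inactive. The 65535 iteration cap is only a safety net against a
 * corrupted tree; a healthy hierarchy terminates within a few steps.
 */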
839 
840 /* dequeues packet at given priority and level; call only if
841  * you are sure that there is an active class at prio/level
842  */
843 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
844 					const int level)
845 {
846 	struct sk_buff *skb = NULL;
847 	struct htb_class *cl, *start;
848 	struct htb_level *hlevel = &q->hlevel[level];
849 	struct htb_prio *hprio = &hlevel->hprio[prio];
850 
851 	/* look initial class up in the row */
852 	start = cl = htb_lookup_leaf(hprio, prio);
853 
854 	do {
855 next:
856 		if (unlikely(!cl))
857 			return NULL;
858 
859 		/* the class can be empty - it is unlikely but can happen if the
860 		 * leaf qdisc drops packets in its enqueue routine or if someone
861 		 * used the graft operation on the leaf since the last dequeue;
862 		 * simply deactivate and skip such a class
863 		 */
864 		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
865 			struct htb_class *next;
866 			htb_deactivate(q, cl);
867 
868 			/* row/level might become empty */
869 			if ((q->row_mask[level] & (1 << prio)) == 0)
870 				return NULL;
871 
872 			next = htb_lookup_leaf(hprio, prio);
873 
874 			if (cl == start)	/* fix start if we just deleted it */
875 				start = next;
876 			cl = next;
877 			goto next;
878 		}
879 
880 		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
881 		if (likely(skb != NULL))
882 			break;
883 
884 		qdisc_warn_nonwc("htb", cl->un.leaf.q);
885 		htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
886 					 &q->hlevel[0].hprio[prio].ptr);
887 		cl = htb_lookup_leaf(hprio, prio);
888 
889 	} while (cl != start);
890 
891 	if (likely(skb != NULL)) {
892 		bstats_update(&cl->bstats, skb);
893 		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
894 		if (cl->un.leaf.deficit[level] < 0) {
895 			cl->un.leaf.deficit[level] += cl->quantum;
896 			htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
897 						 &q->hlevel[0].hprio[prio].ptr);
898 		}
899 		/* this used to be after charge_class but this constellation
900 		 * gives us slightly better performance
901 		 */
902 		if (!cl->un.leaf.q->q.qlen)
903 			htb_deactivate(q, cl);
904 		htb_charge_class(q, cl, level, skb);
905 	}
906 	return skb;
907 }
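/* DRR in numbers (a hedged sketch using iproute2's customary r2q of 10):
 * a 10 Mbit/s leaf gets quantum = rate_bytes_ps / r2q = 1,250,000 / 10 =
 * 125,000 bytes unless quantum was given explicitly (and is clamped to
 * 1000..200000 otherwise, see htb_change_class()). The same leaf keeps
 * being picked until its deficit[level] goes negative; then quantum is
 * added back and the round-robin pointer advances, so competing classes
 * of equal prio each send about one quantum per round.
 */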
908 
909 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
910 {
911 	struct sk_buff *skb;
912 	struct htb_sched *q = qdisc_priv(sch);
913 	int level;
914 	s64 next_event;
915 	unsigned long start_at;
916 
917 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
918 	skb = __qdisc_dequeue_head(&q->direct_queue);
919 	if (skb != NULL) {
920 ok:
921 		qdisc_bstats_update(sch, skb);
922 		qdisc_qstats_backlog_dec(sch, skb);
923 		sch->q.qlen--;
924 		return skb;
925 	}
926 
927 	if (!sch->q.qlen)
928 		goto fin;
929 	q->now = ktime_get_ns();
930 	start_at = jiffies;
931 
932 	next_event = q->now + 5LLU * NSEC_PER_SEC;
933 
934 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
935 		/* common case optimization - skip event handler quickly */
936 		int m;
937 		s64 event = q->near_ev_cache[level];
938 
939 		if (q->now >= event) {
940 			event = htb_do_events(q, level, start_at);
941 			if (!event)
942 				event = q->now + NSEC_PER_SEC;
943 			q->near_ev_cache[level] = event;
944 		}
945 
946 		if (next_event > event)
947 			next_event = event;
948 
949 		m = ~q->row_mask[level];
950 		while (m != (int)(-1)) {
951 			int prio = ffz(m);
952 
953 			m |= 1 << prio;
954 			skb = htb_dequeue_tree(q, prio, level);
955 			if (likely(skb != NULL))
956 				goto ok;
957 		}
958 	}
959 	qdisc_qstats_overlimit(sch);
960 	if (likely(next_event > q->now))
961 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
962 	else
963 		schedule_work(&q->work);
964 fin:
965 	return skb;
966 }
967 
968 /* reset all classes */
969 /* always called under BH & queue lock */
970 static void htb_reset(struct Qdisc *sch)
971 {
972 	struct htb_sched *q = qdisc_priv(sch);
973 	struct htb_class *cl;
974 	unsigned int i;
975 
976 	for (i = 0; i < q->clhash.hashsize; i++) {
977 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
978 			if (cl->level)
979 				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
980 			else {
981 				if (cl->un.leaf.q)
982 					qdisc_reset(cl->un.leaf.q);
983 				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
984 			}
985 			cl->prio_activity = 0;
986 			cl->cmode = HTB_CAN_SEND;
987 		}
988 	}
989 	qdisc_watchdog_cancel(&q->watchdog);
990 	__qdisc_reset_queue(&q->direct_queue);
991 	sch->q.qlen = 0;
992 	sch->qstats.backlog = 0;
993 	memset(q->hlevel, 0, sizeof(q->hlevel));
994 	memset(q->row_mask, 0, sizeof(q->row_mask));
995 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
996 		INIT_LIST_HEAD(q->drops + i);
997 }
998 
999 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1000 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1001 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1002 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1003 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1004 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1005 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1006 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1007 };
1008 
1009 static void htb_work_func(struct work_struct *work)
1010 {
1011 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1012 	struct Qdisc *sch = q->watchdog.qdisc;
1013 
1014 	rcu_read_lock();
1015 	__netif_schedule(qdisc_root(sch));
1016 	rcu_read_unlock();
1017 }
1018 
1019 static int htb_init(struct Qdisc *sch, struct nlattr *opt)
1020 {
1021 	struct htb_sched *q = qdisc_priv(sch);
1022 	struct nlattr *tb[TCA_HTB_MAX + 1];
1023 	struct tc_htb_glob *gopt;
1024 	int err;
1025 	int i;
1026 
1027 	qdisc_watchdog_init(&q->watchdog, sch);
1028 	INIT_WORK(&q->work, htb_work_func);
1029 
1030 	if (!opt)
1031 		return -EINVAL;
1032 
1033 	err = tcf_block_get(&q->block, &q->filter_list);
1034 	if (err)
1035 		return err;
1036 
1037 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1038 	if (err < 0)
1039 		return err;
1040 
1041 	if (!tb[TCA_HTB_INIT])
1042 		return -EINVAL;
1043 
1044 	gopt = nla_data(tb[TCA_HTB_INIT]);
1045 	if (gopt->version != HTB_VER >> 16)
1046 		return -EINVAL;
1047 
1048 	err = qdisc_class_hash_init(&q->clhash);
1049 	if (err < 0)
1050 		return err;
1051 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1052 		INIT_LIST_HEAD(q->drops + i);
1053 
1054 	qdisc_skb_head_init(&q->direct_queue);
1055 
1056 	if (tb[TCA_HTB_DIRECT_QLEN])
1057 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1058 	else
1059 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1060 
1061 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1062 		q->rate2quantum = 1;
1063 	q->defcls = gopt->defcls;
1064 
1065 	return 0;
1066 }
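/* The TCA_HTB_INIT attribute parsed above is what "tc qdisc add ... htb"
 * sends. A hedged example of the userspace side:
 *
 *	tc qdisc add dev eth0 root handle 1: htb default 20 r2q 10 direct_qlen 1000
 *
 * fills in gopt->defcls (0x20), gopt->rate2quantum (10) and, via the
 * separate TCA_HTB_DIRECT_QLEN attribute, q->direct_qlen; if direct_qlen
 * is omitted the device tx_queue_len is used instead.
 */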
1067 
1068 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1069 {
1070 	struct htb_sched *q = qdisc_priv(sch);
1071 	struct nlattr *nest;
1072 	struct tc_htb_glob gopt;
1073 
1074 	/* It's safe not to acquire the qdisc lock. As we hold RTNL,
1075 	 * no change can happen on the qdisc parameters.
1076 	 */
1077 
1078 	gopt.direct_pkts = q->direct_pkts;
1079 	gopt.version = HTB_VER;
1080 	gopt.rate2quantum = q->rate2quantum;
1081 	gopt.defcls = q->defcls;
1082 	gopt.debug = 0;
1083 
1084 	nest = nla_nest_start(skb, TCA_OPTIONS);
1085 	if (nest == NULL)
1086 		goto nla_put_failure;
1087 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1088 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1089 		goto nla_put_failure;
1090 
1091 	return nla_nest_end(skb, nest);
1092 
1093 nla_put_failure:
1094 	nla_nest_cancel(skb, nest);
1095 	return -1;
1096 }
1097 
1098 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1099 			  struct sk_buff *skb, struct tcmsg *tcm)
1100 {
1101 	struct htb_class *cl = (struct htb_class *)arg;
1102 	struct nlattr *nest;
1103 	struct tc_htb_opt opt;
1104 
1105 	/* It's safe not to acquire the qdisc lock. As we hold RTNL,
1106 	 * no change can happen on the class parameters.
1107 	 */
1108 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1109 	tcm->tcm_handle = cl->common.classid;
1110 	if (!cl->level && cl->un.leaf.q)
1111 		tcm->tcm_info = cl->un.leaf.q->handle;
1112 
1113 	nest = nla_nest_start(skb, TCA_OPTIONS);
1114 	if (nest == NULL)
1115 		goto nla_put_failure;
1116 
1117 	memset(&opt, 0, sizeof(opt));
1118 
1119 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1120 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1121 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1122 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1123 	opt.quantum = cl->quantum;
1124 	opt.prio = cl->prio;
1125 	opt.level = cl->level;
1126 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1127 		goto nla_put_failure;
1128 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1129 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1130 			      TCA_HTB_PAD))
1131 		goto nla_put_failure;
1132 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1133 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1134 			      TCA_HTB_PAD))
1135 		goto nla_put_failure;
1136 
1137 	return nla_nest_end(skb, nest);
1138 
1139 nla_put_failure:
1140 	nla_nest_cancel(skb, nest);
1141 	return -1;
1142 }
1143 
1144 static int
1145 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1146 {
1147 	struct htb_class *cl = (struct htb_class *)arg;
1148 	struct gnet_stats_queue qs = {
1149 		.drops = cl->drops,
1150 		.overlimits = cl->overlimits,
1151 	};
1152 	__u32 qlen = 0;
1153 
1154 	if (!cl->level && cl->un.leaf.q) {
1155 		qlen = cl->un.leaf.q->q.qlen;
1156 		qs.backlog = cl->un.leaf.q->qstats.backlog;
1157 	}
1158 	cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
1159 				    INT_MIN, INT_MAX);
1160 	cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens),
1161 				     INT_MIN, INT_MAX);
1162 
1163 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1164 				  d, NULL, &cl->bstats) < 0 ||
1165 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1166 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1167 		return -1;
1168 
1169 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1170 }
1171 
1172 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1173 		     struct Qdisc **old)
1174 {
1175 	struct htb_class *cl = (struct htb_class *)arg;
1176 
1177 	if (cl->level)
1178 		return -EINVAL;
1179 	if (new == NULL &&
1180 	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1181 				     cl->common.classid)) == NULL)
1182 		return -ENOBUFS;
1183 
1184 	*old = qdisc_replace(sch, new, &cl->un.leaf.q);
1185 	return 0;
1186 }
1187 
1188 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1189 {
1190 	struct htb_class *cl = (struct htb_class *)arg;
1191 	return !cl->level ? cl->un.leaf.q : NULL;
1192 }
1193 
1194 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1195 {
1196 	struct htb_class *cl = (struct htb_class *)arg;
1197 
1198 	htb_deactivate(qdisc_priv(sch), cl);
1199 }
1200 
1201 static inline int htb_parent_last_child(struct htb_class *cl)
1202 {
1203 	if (!cl->parent)
1204 		/* the root class */
1205 		return 0;
1206 	if (cl->parent->children > 1)
1207 		/* not the last child */
1208 		return 0;
1209 	return 1;
1210 }
1211 
1212 static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1213 			       struct Qdisc *new_q)
1214 {
1215 	struct htb_class *parent = cl->parent;
1216 
1217 	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1218 
1219 	if (parent->cmode != HTB_CAN_SEND)
1220 		htb_safe_rb_erase(&parent->pq_node,
1221 				  &q->hlevel[parent->level].wait_pq);
1222 
1223 	parent->level = 0;
1224 	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1225 	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1226 	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1227 	parent->tokens = parent->buffer;
1228 	parent->ctokens = parent->cbuffer;
1229 	parent->t_c = ktime_get_ns();
1230 	parent->cmode = HTB_CAN_SEND;
1231 }
1232 
1233 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1234 {
1235 	if (!cl->level) {
1236 		WARN_ON(!cl->un.leaf.q);
1237 		qdisc_destroy(cl->un.leaf.q);
1238 	}
1239 	gen_kill_estimator(&cl->rate_est);
1240 	tcf_block_put(cl->block);
1241 	kfree(cl);
1242 }
1243 
1244 static void htb_destroy(struct Qdisc *sch)
1245 {
1246 	struct htb_sched *q = qdisc_priv(sch);
1247 	struct hlist_node *next;
1248 	struct htb_class *cl;
1249 	unsigned int i;
1250 
1251 	cancel_work_sync(&q->work);
1252 	qdisc_watchdog_cancel(&q->watchdog);
1253 	/* This line used to be after htb_destroy_class call below
1254 	 * and surprisingly it worked in 2.4. But it must precede it
1255 	 * because filters need their target class alive to be able to call
1256 	 * unbind_filter on it (without an Oops).
1257 	 */
1258 	tcf_block_put(q->block);
1259 
1260 	for (i = 0; i < q->clhash.hashsize; i++) {
1261 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1262 			tcf_block_put(cl->block);
1263 			cl->block = NULL;
1264 		}
1265 	}
1266 	for (i = 0; i < q->clhash.hashsize; i++) {
1267 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1268 					  common.hnode)
1269 			htb_destroy_class(sch, cl);
1270 	}
1271 	qdisc_class_hash_destroy(&q->clhash);
1272 	__qdisc_reset_queue(&q->direct_queue);
1273 }
1274 
1275 static int htb_delete(struct Qdisc *sch, unsigned long arg)
1276 {
1277 	struct htb_sched *q = qdisc_priv(sch);
1278 	struct htb_class *cl = (struct htb_class *)arg;
1279 	struct Qdisc *new_q = NULL;
1280 	int last_child = 0;
1281 
1282 	/* TODO: why don't we allow deleting a subtree? references? does the
1283 	 * tc subsystem guarantee us that in htb_destroy it holds no class
1284 	 * refs, so that we can remove children safely there?
1285 	 */
1286 	if (cl->children || cl->filter_cnt)
1287 		return -EBUSY;
1288 
1289 	if (!cl->level && htb_parent_last_child(cl)) {
1290 		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1291 					  cl->parent->common.classid);
1292 		last_child = 1;
1293 	}
1294 
1295 	sch_tree_lock(sch);
1296 
1297 	if (!cl->level) {
1298 		unsigned int qlen = cl->un.leaf.q->q.qlen;
1299 		unsigned int backlog = cl->un.leaf.q->qstats.backlog;
1300 
1301 		qdisc_reset(cl->un.leaf.q);
1302 		qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
1303 	}
1304 
1305 	/* delete from hash and active; remainder in destroy_class */
1306 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1307 	if (cl->parent)
1308 		cl->parent->children--;
1309 
1310 	if (cl->prio_activity)
1311 		htb_deactivate(q, cl);
1312 
1313 	if (cl->cmode != HTB_CAN_SEND)
1314 		htb_safe_rb_erase(&cl->pq_node,
1315 				  &q->hlevel[cl->level].wait_pq);
1316 
1317 	if (last_child)
1318 		htb_parent_to_leaf(q, cl, new_q);
1319 
1320 	sch_tree_unlock(sch);
1321 
1322 	htb_destroy_class(sch, cl);
1323 	return 0;
1324 }
1325 
1326 static int htb_change_class(struct Qdisc *sch, u32 classid,
1327 			    u32 parentid, struct nlattr **tca,
1328 			    unsigned long *arg)
1329 {
1330 	int err = -EINVAL;
1331 	struct htb_sched *q = qdisc_priv(sch);
1332 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1333 	struct nlattr *opt = tca[TCA_OPTIONS];
1334 	struct nlattr *tb[TCA_HTB_MAX + 1];
1335 	struct tc_htb_opt *hopt;
1336 	u64 rate64, ceil64;
1337 
1338 	/* extract all subattrs from opt attr */
1339 	if (!opt)
1340 		goto failure;
1341 
1342 	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL);
1343 	if (err < 0)
1344 		goto failure;
1345 
1346 	err = -EINVAL;
1347 	if (tb[TCA_HTB_PARMS] == NULL)
1348 		goto failure;
1349 
1350 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1351 
1352 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1353 	if (!hopt->rate.rate || !hopt->ceil.rate)
1354 		goto failure;
1355 
1356 	/* Keep backward compatibility with rate_table based iproute2 tc */
1357 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1358 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]));
1359 
1360 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1361 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]));
1362 
1363 	if (!cl) {		/* new class */
1364 		struct Qdisc *new_q;
1365 		int prio;
1366 		struct {
1367 			struct nlattr		nla;
1368 			struct gnet_estimator	opt;
1369 		} est = {
1370 			.nla = {
1371 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1372 				.nla_type	= TCA_RATE,
1373 			},
1374 			.opt = {
1375 				/* 4s interval, 16s averaging constant */
1376 				.interval	= 2,
1377 				.ewma_log	= 2,
1378 			},
1379 		};
1380 
1381 		/* check for valid classid */
1382 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1383 		    htb_find(classid, sch))
1384 			goto failure;
1385 
1386 		/* check maximal depth */
1387 		if (parent && parent->parent && parent->parent->level < 2) {
1388 			pr_err("htb: tree is too deep\n");
1389 			goto failure;
1390 		}
1391 		err = -ENOBUFS;
1392 		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1393 		if (!cl)
1394 			goto failure;
1395 
1396 		err = tcf_block_get(&cl->block, &cl->filter_list);
1397 		if (err) {
1398 			kfree(cl);
1399 			goto failure;
1400 		}
1401 		if (htb_rate_est || tca[TCA_RATE]) {
1402 			err = gen_new_estimator(&cl->bstats, NULL,
1403 						&cl->rate_est,
1404 						NULL,
1405 						qdisc_root_sleeping_running(sch),
1406 						tca[TCA_RATE] ? : &est.nla);
1407 			if (err) {
1408 				tcf_block_put(cl->block);
1409 				kfree(cl);
1410 				goto failure;
1411 			}
1412 		}
1413 
1414 		cl->children = 0;
1415 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1416 		RB_CLEAR_NODE(&cl->pq_node);
1417 
1418 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1419 			RB_CLEAR_NODE(&cl->node[prio]);
1420 
1421 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1422 		 * so it can't be used inside of sch_tree_lock
1423 		 * -- thanks to Karlis Peisenieks
1424 		 */
1425 		new_q = qdisc_create_dflt(sch->dev_queue,
1426 					  &pfifo_qdisc_ops, classid);
1427 		sch_tree_lock(sch);
1428 		if (parent && !parent->level) {
1429 			unsigned int qlen = parent->un.leaf.q->q.qlen;
1430 			unsigned int backlog = parent->un.leaf.q->qstats.backlog;
1431 
1432 			/* turn parent into inner node */
1433 			qdisc_reset(parent->un.leaf.q);
1434 			qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
1435 			qdisc_destroy(parent->un.leaf.q);
1436 			if (parent->prio_activity)
1437 				htb_deactivate(q, parent);
1438 
1439 			/* remove from evt list because of level change */
1440 			if (parent->cmode != HTB_CAN_SEND) {
1441 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1442 				parent->cmode = HTB_CAN_SEND;
1443 			}
1444 			parent->level = (parent->parent ? parent->parent->level
1445 					 : TC_HTB_MAXDEPTH) - 1;
1446 			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1447 		}
1448 		/* leaf (we) needs elementary qdisc */
1449 		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1450 
1451 		cl->common.classid = classid;
1452 		cl->parent = parent;
1453 
1454 		/* set class to be in HTB_CAN_SEND state */
1455 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1456 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1457 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1458 		cl->t_c = ktime_get_ns();
1459 		cl->cmode = HTB_CAN_SEND;
1460 
1461 		/* attach to the hash list and parent's family */
1462 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1463 		if (parent)
1464 			parent->children++;
1465 		if (cl->un.leaf.q != &noop_qdisc)
1466 			qdisc_hash_add(cl->un.leaf.q, true);
1467 	} else {
1468 		if (tca[TCA_RATE]) {
1469 			err = gen_replace_estimator(&cl->bstats, NULL,
1470 						    &cl->rate_est,
1471 						    NULL,
1472 						    qdisc_root_sleeping_running(sch),
1473 						    tca[TCA_RATE]);
1474 			if (err)
1475 				return err;
1476 		}
1477 		sch_tree_lock(sch);
1478 	}
1479 
1480 	rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0;
1481 
1482 	ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0;
1483 
1484 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
1485 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
1486 
1487 	/* there used to be a nasty bug here: we have to check that the node
1488 	 * is really a leaf before changing cl->un.leaf!
1489 	 */
1490 	if (!cl->level) {
1491 		u64 quantum = cl->rate.rate_bytes_ps;
1492 
1493 		do_div(quantum, q->rate2quantum);
1494 		cl->quantum = min_t(u64, quantum, INT_MAX);
1495 
1496 		if (!hopt->quantum && cl->quantum < 1000) {
1497 			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
1498 				cl->common.classid);
1499 			cl->quantum = 1000;
1500 		}
1501 		if (!hopt->quantum && cl->quantum > 200000) {
1502 			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
1503 				cl->common.classid);
1504 			cl->quantum = 200000;
1505 		}
1506 		if (hopt->quantum)
1507 			cl->quantum = hopt->quantum;
1508 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1509 			cl->prio = TC_HTB_NUMPRIO - 1;
1510 	}
1511 
1512 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
1513 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
1514 
1515 	sch_tree_unlock(sch);
1516 
1517 	qdisc_class_hash_grow(sch, &q->clhash);
1518 
1519 	*arg = (unsigned long)cl;
1520 	return 0;
1521 
1522 failure:
1523 	return err;
1524 }
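/* Userspace side of htb_change_class(), as a hedged sketch (all values are
 * arbitrary examples):
 *
 *	tc class add dev eth0 parent 1:1 classid 1:10 htb \
 *		rate 30mbit ceil 100mbit burst 15k cburst 15k prio 0 quantum 60000
 *
 * arrives here with TCA_HTB_PARMS carrying rate, ceil, buffer, cbuffer,
 * prio and quantum; rates that do not fit in 32 bits of bytes/s travel in
 * the TCA_HTB_RATE64/TCA_HTB_CEIL64 attributes, mirroring what
 * htb_dump_class() emits on the way out.
 */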
1525 
1526 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg)
1527 {
1528 	struct htb_sched *q = qdisc_priv(sch);
1529 	struct htb_class *cl = (struct htb_class *)arg;
1530 
1531 	return cl ? cl->block : q->block;
1532 }
1533 
1534 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1535 				     u32 classid)
1536 {
1537 	struct htb_class *cl = htb_find(classid, sch);
1538 
1539 	/*if (cl && !cl->level) return 0;
1540 	 * The line above used to be there to prevent attaching filters to
1541 	 * leaves. But at least the tc_index filter uses this just to get the
1542 	 * class for other reasons, so we have to allow it.
1543 	 * ----
1544 	 * 19.6.2002 As Werner explained, it is OK - bind_filter is just
1545 	 * another way to "lock" the class - unlike "get", this lock can
1546 	 * be broken by the class during destroy, IIUC.
1547 	 */
1548 	if (cl)
1549 		cl->filter_cnt++;
1550 	return (unsigned long)cl;
1551 }
1552 
1553 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1554 {
1555 	struct htb_class *cl = (struct htb_class *)arg;
1556 
1557 	if (cl)
1558 		cl->filter_cnt--;
1559 }
1560 
1561 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1562 {
1563 	struct htb_sched *q = qdisc_priv(sch);
1564 	struct htb_class *cl;
1565 	unsigned int i;
1566 
1567 	if (arg->stop)
1568 		return;
1569 
1570 	for (i = 0; i < q->clhash.hashsize; i++) {
1571 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1572 			if (arg->count < arg->skip) {
1573 				arg->count++;
1574 				continue;
1575 			}
1576 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1577 				arg->stop = 1;
1578 				return;
1579 			}
1580 			arg->count++;
1581 		}
1582 	}
1583 }
1584 
1585 static const struct Qdisc_class_ops htb_class_ops = {
1586 	.graft		=	htb_graft,
1587 	.leaf		=	htb_leaf,
1588 	.qlen_notify	=	htb_qlen_notify,
1589 	.find		=	htb_search,
1590 	.change		=	htb_change_class,
1591 	.delete		=	htb_delete,
1592 	.walk		=	htb_walk,
1593 	.tcf_block	=	htb_tcf_block,
1594 	.bind_tcf	=	htb_bind_filter,
1595 	.unbind_tcf	=	htb_unbind_filter,
1596 	.dump		=	htb_dump_class,
1597 	.dump_stats	=	htb_dump_class_stats,
1598 };
1599 
1600 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1601 	.cl_ops		=	&htb_class_ops,
1602 	.id		=	"htb",
1603 	.priv_size	=	sizeof(struct htb_sched),
1604 	.enqueue	=	htb_enqueue,
1605 	.dequeue	=	htb_dequeue,
1606 	.peek		=	qdisc_peek_dequeued,
1607 	.init		=	htb_init,
1608 	.reset		=	htb_reset,
1609 	.destroy	=	htb_destroy,
1610 	.dump		=	htb_dump,
1611 	.owner		=	THIS_MODULE,
1612 };
1613 
1614 static int __init htb_module_init(void)
1615 {
1616 	return register_qdisc(&htb_qdisc_ops);
1617 }
1618 static void __exit htb_module_exit(void)
1619 {
1620 	unregister_qdisc(&htb_qdisc_ops);
1621 }
1622 
1623 module_init(htb_module_init)
1624 module_exit(htb_module_exit)
1625 MODULE_LICENSE("GPL");
1626