xref: /openbmc/linux/net/sched/sch_api.c (revision a86854d0)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39 
40 /*
41 
42    Short review.
43    -------------
44 
45    This file consists of two interrelated parts:
46 
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49 
   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when the
   device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.
54 
   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).
59 
   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.
62 
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.
68 
69    All real intelligent work is done inside qdisc modules.
70 
71 
72 
73    Every discipline has two major routines: enqueue and dequeue.
74 
75    ---dequeue
76 
   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue, but q->q.qlen must still be valid.
83 
84    ---enqueue
85 
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
93 
94    Auxiliary routines:
95 
96    ---peek
97 
98    like dequeue but without removing a packet from the queue
99 
100    ---reset
101 
   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics) etc.
104 
105    ---init
106 
107    initializes newly created qdisc.
108 
109    ---destroy
110 
111    destroys resources allocated by init and during lifetime of qdisc.
112 
113    ---change
114 
115    changes qdisc parameters.
116  */
117 
/* Protects the list of registered qdisc ops (qdisc_base) and
 * default_qdisc_ops. It is purely an SMP lock.
 */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120 
121 
122 /************************************************
123  *	Queueing disciplines manipulation.	*
124  ************************************************/
125 
126 
/* Head of the list of all registered queueing disciplines, linked via ->next. */
128 
129 static struct Qdisc_ops *qdisc_base;
130 
131 /* Register/unregister queueing discipline */
132 
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135 	struct Qdisc_ops *q, **qp;
136 	int rc = -EEXIST;
137 
138 	write_lock(&qdisc_mod_lock);
139 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140 		if (!strcmp(qops->id, q->id))
141 			goto out;
142 
143 	if (qops->enqueue == NULL)
144 		qops->enqueue = noop_qdisc_ops.enqueue;
145 	if (qops->peek == NULL) {
146 		if (qops->dequeue == NULL)
147 			qops->peek = noop_qdisc_ops.peek;
148 		else
149 			goto out_einval;
150 	}
151 	if (qops->dequeue == NULL)
152 		qops->dequeue = noop_qdisc_ops.dequeue;
153 
154 	if (qops->cl_ops) {
155 		const struct Qdisc_class_ops *cops = qops->cl_ops;
156 
157 		if (!(cops->find && cops->walk && cops->leaf))
158 			goto out_einval;
159 
160 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161 			goto out_einval;
162 	}
163 
164 	qops->next = NULL;
165 	*qp = qops;
166 	rc = 0;
167 out:
168 	write_unlock(&qdisc_mod_lock);
169 	return rc;
170 
171 out_einval:
172 	rc = -EINVAL;
173 	goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176 
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179 	struct Qdisc_ops *q, **qp;
180 	int err = -ENOENT;
181 
182 	write_lock(&qdisc_mod_lock);
183 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184 		if (q == qops)
185 			break;
186 	if (q) {
187 		*qp = q->next;
188 		q->next = NULL;
189 		err = 0;
190 	}
191 	write_unlock(&qdisc_mod_lock);
192 	return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195 
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
198 {
199 	read_lock(&qdisc_mod_lock);
200 	strlcpy(name, default_qdisc_ops->id, len);
201 	read_unlock(&qdisc_mod_lock);
202 }
203 
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206 	struct Qdisc_ops *q = NULL;
207 
208 	for (q = qdisc_base; q; q = q->next) {
209 		if (!strcmp(name, q->id)) {
210 			if (!try_module_get(q->owner))
211 				q = NULL;
212 			break;
213 		}
214 	}
215 
216 	return q;
217 }
218 
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222 	const struct Qdisc_ops *ops;
223 
224 	if (!capable(CAP_NET_ADMIN))
225 		return -EPERM;
226 
227 	write_lock(&qdisc_mod_lock);
228 	ops = qdisc_lookup_default(name);
229 	if (!ops) {
230 		/* Not found, drop lock and try to load module */
231 		write_unlock(&qdisc_mod_lock);
232 		request_module("sch_%s", name);
233 		write_lock(&qdisc_mod_lock);
234 
235 		ops = qdisc_lookup_default(name);
236 	}
237 
238 	if (ops) {
239 		/* Set new default */
240 		module_put(default_qdisc_ops->owner);
241 		default_qdisc_ops = ops;
242 	}
243 	write_unlock(&qdisc_mod_lock);
244 
245 	return ops ? 0 : -ENOENT;
246 }
247 
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by the kernel config. */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
256 
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261 
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264 	struct Qdisc *q;
265 
266 	if (!qdisc_dev(root))
267 		return (root->handle == handle ? root : NULL);
268 
269 	if (!(root->flags & TCQ_F_BUILTIN) &&
270 	    root->handle == handle)
271 		return root;
272 
273 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274 		if (q->handle == handle)
275 			return q;
276 	}
277 	return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 		ASSERT_RTNL();
284 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 		if (invisible)
286 			q->flags |= TCQ_F_INVISIBLE;
287 	}
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 		ASSERT_RTNL();
295 		hash_del_rcu(&q->hash);
296 	}
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 	struct Qdisc *q;
303 
304 	if (!handle)
305 		return NULL;
306 	q = qdisc_match_from_root(dev->qdisc, handle);
307 	if (q)
308 		goto out;
309 
310 	if (dev_ingress_queue(dev))
311 		q = qdisc_match_from_root(
312 			dev_ingress_queue(dev)->qdisc_sleeping,
313 			handle);
314 out:
315 	return q;
316 }
317 
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320 	unsigned long cl;
321 	struct Qdisc *leaf;
322 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323 
324 	if (cops == NULL)
325 		return NULL;
326 	cl = cops->find(p, classid);
327 
328 	if (cl == 0)
329 		return NULL;
330 	leaf = cops->leaf(p, cl);
331 	return leaf;
332 }
333 
334 /* Find queueing discipline by name */
335 
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338 	struct Qdisc_ops *q = NULL;
339 
340 	if (kind) {
341 		read_lock(&qdisc_mod_lock);
342 		for (q = qdisc_base; q; q = q->next) {
343 			if (nla_strcmp(kind, q->id) == 0) {
344 				if (!try_module_get(q->owner))
345 					q = NULL;
346 				break;
347 			}
348 		}
349 		read_unlock(&qdisc_mod_lock);
350 	}
351 	return q;
352 }
353 
354 /* The linklayer setting were not transferred from iproute2, in older
355  * versions, and the rate tables lookup systems have been dropped in
356  * the kernel. To keep backward compatible with older iproute2 tc
357  * utils, we detect the linklayer setting by detecting if the rate
358  * table were modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find low and high table entries for
364  * mapping this cell.  If these entries contain the same value, when
365  * the rate tables have been modified for linklayer ATM.
366  *
367  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368  * and then roundup to the next cell, calc the table entry one below,
369  * and compare.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373 	int low       = roundup(r->mpu, 48);
374 	int high      = roundup(low+1, 48);
375 	int cell_low  = low >> r->cell_log;
376 	int cell_high = (high >> r->cell_log) - 1;
377 
378 	/* rtab is too inaccurate at rates > 100Mbit/s */
379 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380 		pr_debug("TC linklayer: Giving up ATM detection\n");
381 		return TC_LINKLAYER_ETHERNET;
382 	}
383 
384 	if ((cell_high > cell_low) && (cell_high < 256)
385 	    && (rtab[cell_low] == rtab[cell_high])) {
386 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387 			 cell_low, cell_high, rtab[cell_high]);
388 		return TC_LINKLAYER_ATM;
389 	}
390 	return TC_LINKLAYER_ETHERNET;
391 }
392 
393 static struct qdisc_rate_table *qdisc_rtab_list;
394 
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396 					struct nlattr *tab,
397 					struct netlink_ext_ack *extack)
398 {
399 	struct qdisc_rate_table *rtab;
400 
401 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402 	    nla_len(tab) != TC_RTAB_SIZE) {
403 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404 		return NULL;
405 	}
406 
407 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
409 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
410 			rtab->refcnt++;
411 			return rtab;
412 		}
413 	}
414 
415 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
416 	if (rtab) {
417 		rtab->rate = *r;
418 		rtab->refcnt = 1;
419 		memcpy(rtab->data, nla_data(tab), 1024);
420 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
421 			r->linklayer = __detect_linklayer(r, rtab->data);
422 		rtab->next = qdisc_rtab_list;
423 		qdisc_rtab_list = rtab;
424 	} else {
425 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
426 	}
427 	return rtab;
428 }
429 EXPORT_SYMBOL(qdisc_get_rtab);
430 
431 void qdisc_put_rtab(struct qdisc_rate_table *tab)
432 {
433 	struct qdisc_rate_table *rtab, **rtabp;
434 
435 	if (!tab || --tab->refcnt)
436 		return;
437 
438 	for (rtabp = &qdisc_rtab_list;
439 	     (rtab = *rtabp) != NULL;
440 	     rtabp = &rtab->next) {
441 		if (rtab == tab) {
442 			*rtabp = rtab->next;
443 			kfree(rtab);
444 			return;
445 		}
446 	}
447 }
448 EXPORT_SYMBOL(qdisc_put_rtab);
449 
450 static LIST_HEAD(qdisc_stab_list);
451 
452 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
453 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
454 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
455 };
456 
457 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
458 					       struct netlink_ext_ack *extack)
459 {
460 	struct nlattr *tb[TCA_STAB_MAX + 1];
461 	struct qdisc_size_table *stab;
462 	struct tc_sizespec *s;
463 	unsigned int tsize = 0;
464 	u16 *tab = NULL;
465 	int err;
466 
467 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468 	if (err < 0)
469 		return ERR_PTR(err);
470 	if (!tb[TCA_STAB_BASE]) {
471 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472 		return ERR_PTR(-EINVAL);
473 	}
474 
475 	s = nla_data(tb[TCA_STAB_BASE]);
476 
477 	if (s->tsize > 0) {
478 		if (!tb[TCA_STAB_DATA]) {
479 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480 			return ERR_PTR(-EINVAL);
481 		}
482 		tab = nla_data(tb[TCA_STAB_DATA]);
483 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
484 	}
485 
486 	if (tsize != s->tsize || (!tab && tsize > 0)) {
487 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
488 		return ERR_PTR(-EINVAL);
489 	}
490 
491 	list_for_each_entry(stab, &qdisc_stab_list, list) {
492 		if (memcmp(&stab->szopts, s, sizeof(*s)))
493 			continue;
494 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
495 			continue;
496 		stab->refcnt++;
497 		return stab;
498 	}
499 
500 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
501 	if (!stab)
502 		return ERR_PTR(-ENOMEM);
503 
504 	stab->refcnt = 1;
505 	stab->szopts = *s;
506 	if (tsize > 0)
507 		memcpy(stab->data, tab, tsize * sizeof(u16));
508 
509 	list_add_tail(&stab->list, &qdisc_stab_list);
510 
511 	return stab;
512 }
513 
514 static void stab_kfree_rcu(struct rcu_head *head)
515 {
516 	kfree(container_of(head, struct qdisc_size_table, rcu));
517 }
518 
519 void qdisc_put_stab(struct qdisc_size_table *tab)
520 {
521 	if (!tab)
522 		return;
523 
524 	if (--tab->refcnt == 0) {
525 		list_del(&tab->list);
526 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
527 	}
528 }
529 EXPORT_SYMBOL(qdisc_put_stab);
530 
531 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
532 {
533 	struct nlattr *nest;
534 
535 	nest = nla_nest_start(skb, TCA_STAB);
536 	if (nest == NULL)
537 		goto nla_put_failure;
538 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
539 		goto nla_put_failure;
540 	nla_nest_end(skb, nest);
541 
542 	return skb->len;
543 
544 nla_put_failure:
545 	return -1;
546 }
547 
548 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
549 			       const struct qdisc_size_table *stab)
550 {
551 	int pkt_len, slot;
552 
553 	pkt_len = skb->len + stab->szopts.overhead;
554 	if (unlikely(!stab->szopts.tsize))
555 		goto out;
556 
557 	slot = pkt_len + stab->szopts.cell_align;
558 	if (unlikely(slot < 0))
559 		slot = 0;
560 
561 	slot >>= stab->szopts.cell_log;
562 	if (likely(slot < stab->szopts.tsize))
563 		pkt_len = stab->data[slot];
564 	else
565 		pkt_len = stab->data[stab->szopts.tsize - 1] *
566 				(slot / stab->szopts.tsize) +
567 				stab->data[slot % stab->szopts.tsize];
568 
569 	pkt_len <<= stab->szopts.size_log;
570 out:
571 	if (unlikely(pkt_len < 1))
572 		pkt_len = 1;
573 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
574 }
575 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
576 
577 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
578 {
579 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
580 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
581 			txt, qdisc->ops->id, qdisc->handle >> 16);
582 		qdisc->flags |= TCQ_F_WARN_NONWC;
583 	}
584 }
585 EXPORT_SYMBOL(qdisc_warn_nonwc);
586 
587 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
588 {
589 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590 						 timer);
591 
592 	rcu_read_lock();
593 	__netif_schedule(qdisc_root(wd->qdisc));
594 	rcu_read_unlock();
595 
596 	return HRTIMER_NORESTART;
597 }
598 
599 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
600 {
601 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
602 	wd->timer.function = qdisc_watchdog;
603 	wd->qdisc = qdisc;
604 }
605 EXPORT_SYMBOL(qdisc_watchdog_init);
606 
607 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
608 {
609 	if (test_bit(__QDISC_STATE_DEACTIVATED,
610 		     &qdisc_root_sleeping(wd->qdisc)->state))
611 		return;
612 
613 	if (wd->last_expires == expires)
614 		return;
615 
616 	wd->last_expires = expires;
617 	hrtimer_start(&wd->timer,
618 		      ns_to_ktime(expires),
619 		      HRTIMER_MODE_ABS_PINNED);
620 }
621 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
622 
623 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
624 {
625 	hrtimer_cancel(&wd->timer);
626 }
627 EXPORT_SYMBOL(qdisc_watchdog_cancel);
628 
629 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
630 {
631 	struct hlist_head *h;
632 	unsigned int i;
633 
634 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
635 
636 	if (h != NULL) {
637 		for (i = 0; i < n; i++)
638 			INIT_HLIST_HEAD(&h[i]);
639 	}
640 	return h;
641 }
642 
643 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
644 {
645 	struct Qdisc_class_common *cl;
646 	struct hlist_node *next;
647 	struct hlist_head *nhash, *ohash;
648 	unsigned int nsize, nmask, osize;
649 	unsigned int i, h;
650 
651 	/* Rehash when load factor exceeds 0.75 */
652 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
653 		return;
654 	nsize = clhash->hashsize * 2;
655 	nmask = nsize - 1;
656 	nhash = qdisc_class_hash_alloc(nsize);
657 	if (nhash == NULL)
658 		return;
659 
660 	ohash = clhash->hash;
661 	osize = clhash->hashsize;
662 
663 	sch_tree_lock(sch);
664 	for (i = 0; i < osize; i++) {
665 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
666 			h = qdisc_class_hash(cl->classid, nmask);
667 			hlist_add_head(&cl->hnode, &nhash[h]);
668 		}
669 	}
670 	clhash->hash     = nhash;
671 	clhash->hashsize = nsize;
672 	clhash->hashmask = nmask;
673 	sch_tree_unlock(sch);
674 
675 	kvfree(ohash);
676 }
677 EXPORT_SYMBOL(qdisc_class_hash_grow);
678 
679 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
680 {
681 	unsigned int size = 4;
682 
683 	clhash->hash = qdisc_class_hash_alloc(size);
684 	if (!clhash->hash)
685 		return -ENOMEM;
686 	clhash->hashsize  = size;
687 	clhash->hashmask  = size - 1;
688 	clhash->hashelems = 0;
689 	return 0;
690 }
691 EXPORT_SYMBOL(qdisc_class_hash_init);
692 
693 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
694 {
695 	kvfree(clhash->hash);
696 }
697 EXPORT_SYMBOL(qdisc_class_hash_destroy);
698 
699 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
700 			     struct Qdisc_class_common *cl)
701 {
702 	unsigned int h;
703 
704 	INIT_HLIST_NODE(&cl->hnode);
705 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
706 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
707 	clhash->hashelems++;
708 }
709 EXPORT_SYMBOL(qdisc_class_hash_insert);
710 
711 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
712 			     struct Qdisc_class_common *cl)
713 {
714 	hlist_del(&cl->hnode);
715 	clhash->hashelems--;
716 }
717 EXPORT_SYMBOL(qdisc_class_hash_remove);
718 
719 /* Allocate an unique handle from space managed by kernel
720  * Possible range is [8000-FFFF]:0000 (0x8000 values)
721  */
722 static u32 qdisc_alloc_handle(struct net_device *dev)
723 {
724 	int i = 0x8000;
725 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
726 
727 	do {
728 		autohandle += TC_H_MAKE(0x10000U, 0);
729 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
730 			autohandle = TC_H_MAKE(0x80000000U, 0);
731 		if (!qdisc_lookup(dev, autohandle))
732 			return autohandle;
733 		cond_resched();
734 	} while	(--i > 0);
735 
736 	return 0;
737 }
738 
739 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
740 			       unsigned int len)
741 {
742 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
743 	const struct Qdisc_class_ops *cops;
744 	unsigned long cl;
745 	u32 parentid;
746 	bool notify;
747 	int drops;
748 
749 	if (n == 0 && len == 0)
750 		return;
751 	drops = max_t(int, n, 0);
752 	rcu_read_lock();
753 	while ((parentid = sch->parent)) {
754 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
755 			break;
756 
757 		if (sch->flags & TCQ_F_NOPARENT)
758 			break;
759 		/* Notify parent qdisc only if child qdisc becomes empty.
760 		 *
761 		 * If child was empty even before update then backlog
762 		 * counter is screwed and we skip notification because
763 		 * parent class is already passive.
764 		 *
765 		 * If the original child was offloaded then it is allowed
766 		 * to be seem as empty, so the parent is notified anyway.
767 		 */
768 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
769 						       !qdisc_is_offloaded);
770 		/* TODO: perform the search on a per txq basis */
771 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
772 		if (sch == NULL) {
773 			WARN_ON_ONCE(parentid != TC_H_ROOT);
774 			break;
775 		}
776 		cops = sch->ops->cl_ops;
777 		if (notify && cops->qlen_notify) {
778 			cl = cops->find(sch, parentid);
779 			cops->qlen_notify(sch, cl);
780 		}
781 		sch->q.qlen -= n;
782 		sch->qstats.backlog -= len;
783 		__qdisc_qstats_drop(sch, drops);
784 	}
785 	rcu_read_unlock();
786 }
787 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
788 
789 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
790 			 u32 portid, u32 seq, u16 flags, int event)
791 {
792 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
793 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
794 	struct tcmsg *tcm;
795 	struct nlmsghdr  *nlh;
796 	unsigned char *b = skb_tail_pointer(skb);
797 	struct gnet_dump d;
798 	struct qdisc_size_table *stab;
799 	u32 block_index;
800 	__u32 qlen;
801 
802 	cond_resched();
803 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
804 	if (!nlh)
805 		goto out_nlmsg_trim;
806 	tcm = nlmsg_data(nlh);
807 	tcm->tcm_family = AF_UNSPEC;
808 	tcm->tcm__pad1 = 0;
809 	tcm->tcm__pad2 = 0;
810 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
811 	tcm->tcm_parent = clid;
812 	tcm->tcm_handle = q->handle;
813 	tcm->tcm_info = refcount_read(&q->refcnt);
814 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
815 		goto nla_put_failure;
816 	if (q->ops->ingress_block_get) {
817 		block_index = q->ops->ingress_block_get(q);
818 		if (block_index &&
819 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
820 			goto nla_put_failure;
821 	}
822 	if (q->ops->egress_block_get) {
823 		block_index = q->ops->egress_block_get(q);
824 		if (block_index &&
825 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
826 			goto nla_put_failure;
827 	}
828 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
829 		goto nla_put_failure;
830 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
831 		goto nla_put_failure;
832 	qlen = qdisc_qlen_sum(q);
833 
834 	stab = rtnl_dereference(q->stab);
835 	if (stab && qdisc_dump_stab(skb, stab) < 0)
836 		goto nla_put_failure;
837 
838 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
839 					 NULL, &d, TCA_PAD) < 0)
840 		goto nla_put_failure;
841 
842 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
843 		goto nla_put_failure;
844 
845 	if (qdisc_is_percpu_stats(q)) {
846 		cpu_bstats = q->cpu_bstats;
847 		cpu_qstats = q->cpu_qstats;
848 	}
849 
850 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
851 				  &d, cpu_bstats, &q->bstats) < 0 ||
852 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
853 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
854 		goto nla_put_failure;
855 
856 	if (gnet_stats_finish_copy(&d) < 0)
857 		goto nla_put_failure;
858 
859 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
860 	return skb->len;
861 
862 out_nlmsg_trim:
863 nla_put_failure:
864 	nlmsg_trim(skb, b);
865 	return -1;
866 }
867 
868 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
869 {
870 	if (q->flags & TCQ_F_BUILTIN)
871 		return true;
872 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
873 		return true;
874 
875 	return false;
876 }
877 
878 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
879 			struct nlmsghdr *n, u32 clid,
880 			struct Qdisc *old, struct Qdisc *new)
881 {
882 	struct sk_buff *skb;
883 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
884 
885 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
886 	if (!skb)
887 		return -ENOBUFS;
888 
889 	if (old && !tc_qdisc_dump_ignore(old, false)) {
890 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
891 				  0, RTM_DELQDISC) < 0)
892 			goto err_out;
893 	}
894 	if (new && !tc_qdisc_dump_ignore(new, false)) {
895 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
896 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
897 			goto err_out;
898 	}
899 
900 	if (skb->len)
901 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
902 				      n->nlmsg_flags & NLM_F_ECHO);
903 
904 err_out:
905 	kfree_skb(skb);
906 	return -EINVAL;
907 }
908 
909 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
910 			       struct nlmsghdr *n, u32 clid,
911 			       struct Qdisc *old, struct Qdisc *new)
912 {
913 	if (new || old)
914 		qdisc_notify(net, skb, n, clid, old, new);
915 
916 	if (old)
917 		qdisc_destroy(old);
918 }
919 
920 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
921  * to device "dev".
922  *
923  * When appropriate send a netlink notification using 'skb'
924  * and "n".
925  *
926  * On success, destroy old qdisc.
927  */
928 
929 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
930 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
931 		       struct Qdisc *new, struct Qdisc *old,
932 		       struct netlink_ext_ack *extack)
933 {
934 	struct Qdisc *q = old;
935 	struct net *net = dev_net(dev);
936 	int err = 0;
937 
938 	if (parent == NULL) {
939 		unsigned int i, num_q, ingress;
940 
941 		ingress = 0;
942 		num_q = dev->num_tx_queues;
943 		if ((q && q->flags & TCQ_F_INGRESS) ||
944 		    (new && new->flags & TCQ_F_INGRESS)) {
945 			num_q = 1;
946 			ingress = 1;
947 			if (!dev_ingress_queue(dev)) {
948 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
949 				return -ENOENT;
950 			}
951 		}
952 
953 		if (dev->flags & IFF_UP)
954 			dev_deactivate(dev);
955 
956 		if (new && new->ops->attach)
957 			goto skip;
958 
959 		for (i = 0; i < num_q; i++) {
960 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
961 
962 			if (!ingress)
963 				dev_queue = netdev_get_tx_queue(dev, i);
964 
965 			old = dev_graft_qdisc(dev_queue, new);
966 			if (new && i > 0)
967 				qdisc_refcount_inc(new);
968 
969 			if (!ingress)
970 				qdisc_destroy(old);
971 		}
972 
973 skip:
974 		if (!ingress) {
975 			notify_and_destroy(net, skb, n, classid,
976 					   dev->qdisc, new);
977 			if (new && !new->ops->attach)
978 				qdisc_refcount_inc(new);
979 			dev->qdisc = new ? : &noop_qdisc;
980 
981 			if (new && new->ops->attach)
982 				new->ops->attach(new);
983 		} else {
984 			notify_and_destroy(net, skb, n, classid, old, new);
985 		}
986 
987 		if (dev->flags & IFF_UP)
988 			dev_activate(dev);
989 	} else {
990 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
991 
992 		/* Only support running class lockless if parent is lockless */
993 		if (new && (new->flags & TCQ_F_NOLOCK) &&
994 		    parent && !(parent->flags & TCQ_F_NOLOCK))
995 			new->flags &= ~TCQ_F_NOLOCK;
996 
997 		err = -EOPNOTSUPP;
998 		if (cops && cops->graft) {
999 			unsigned long cl = cops->find(parent, classid);
1000 
1001 			if (cl) {
1002 				err = cops->graft(parent, cl, new, &old,
1003 						  extack);
1004 			} else {
1005 				NL_SET_ERR_MSG(extack, "Specified class not found");
1006 				err = -ENOENT;
1007 			}
1008 		}
1009 		if (!err)
1010 			notify_and_destroy(net, skb, n, classid, old, new);
1011 	}
1012 	return err;
1013 }
1014 
1015 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1016 				   struct netlink_ext_ack *extack)
1017 {
1018 	u32 block_index;
1019 
1020 	if (tca[TCA_INGRESS_BLOCK]) {
1021 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1022 
1023 		if (!block_index) {
1024 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1025 			return -EINVAL;
1026 		}
1027 		if (!sch->ops->ingress_block_set) {
1028 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1029 			return -EOPNOTSUPP;
1030 		}
1031 		sch->ops->ingress_block_set(sch, block_index);
1032 	}
1033 	if (tca[TCA_EGRESS_BLOCK]) {
1034 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1035 
1036 		if (!block_index) {
1037 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1038 			return -EINVAL;
1039 		}
1040 		if (!sch->ops->egress_block_set) {
1041 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1042 			return -EOPNOTSUPP;
1043 		}
1044 		sch->ops->egress_block_set(sch, block_index);
1045 	}
1046 	return 0;
1047 }
1048 
/* The lockdep annotation is needed for ingress; egress gets one only for naming purposes. */
1050 static struct lock_class_key qdisc_tx_lock;
1051 static struct lock_class_key qdisc_rx_lock;
1052 
/*
 * Allocate and initialize new qdisc.
 *
 * The qdisc kind is taken from tca[TCA_KIND]; the other netlink attributes
 * in tca (TCA_OPTIONS, TCA_STAB, TCA_RATE and the block index attributes)
 * supply its parameters.
 *
 * @dev:       device the qdisc is created for
 * @dev_queue: queue handed to qdisc_alloc()
 * @p:         parent qdisc, may be NULL; only consulted when choosing the
 *             "running" seqcount for the rate estimator
 * @parent:    stored in sch->parent
 * @handle:    requested handle; TC_H_INGRESS selects an ingress qdisc,
 *             0 asks for a handle from qdisc_alloc_handle()
 * @tca:       parsed netlink attributes
 * @errp:      receives the error code when NULL is returned
 * @extack:    extended ack for error messages
 *
 * Returns the new qdisc (already added to the qdisc hash), or NULL with
 * *errp set.  *errp == -EAGAIN means a module load was attempted with the
 * RTNL lock dropped and the caller has to replay the whole request.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		/* Only try a module load when the kind fits into name[]. */
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		/* Ingress qdisc: fixed handle and its own lockdep class. */
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			/* No handle requested: pick one for this device. */
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to stay backward compatible with a userspace
	 * loophole, which allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	/* From here on failures unwind through err_out4, which also
	 * releases the size table.
	 */
	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Use the root qdisc's seqcount for a non-root, non-ingress
		 * qdisc whose parent is set and is not a multi-queue root;
		 * otherwise use this qdisc's own.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

	/* Error unwinding.  The labels below fall through into each other. */
err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	/* NOTE(review): dev_put() presumably balances a device reference
	 * taken inside qdisc_alloc() - confirm against qdisc_alloc().
	 */
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	/* Drop the module reference held on ops. */
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1216 
1217 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1218 			struct netlink_ext_ack *extack)
1219 {
1220 	struct qdisc_size_table *ostab, *stab = NULL;
1221 	int err = 0;
1222 
1223 	if (tca[TCA_OPTIONS]) {
1224 		if (!sch->ops->change) {
1225 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1226 			return -EINVAL;
1227 		}
1228 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1229 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1230 			return -EOPNOTSUPP;
1231 		}
1232 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1233 		if (err)
1234 			return err;
1235 	}
1236 
1237 	if (tca[TCA_STAB]) {
1238 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1239 		if (IS_ERR(stab))
1240 			return PTR_ERR(stab);
1241 	}
1242 
1243 	ostab = rtnl_dereference(sch->stab);
1244 	rcu_assign_pointer(sch->stab, stab);
1245 	qdisc_put_stab(ostab);
1246 
1247 	if (tca[TCA_RATE]) {
1248 		/* NB: ignores errors from replace_estimator
1249 		   because change can't be undone. */
1250 		if (sch->flags & TCQ_F_MQROOT)
1251 			goto out;
1252 		gen_replace_estimator(&sch->bstats,
1253 				      sch->cpu_bstats,
1254 				      &sch->rate_est,
1255 				      NULL,
1256 				      qdisc_root_sleeping_running(sch),
1257 				      tca[TCA_RATE]);
1258 	}
1259 out:
1260 	return 0;
1261 }
1262 
/* Walker state for check_loop().  The qdisc_walker must stay the first
 * member: check_loop_fn() casts the walker pointer back to this struct.
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;	/* qdisc that must not be reached */
	int			depth;	/* current nesting depth of the walk */
};
1268 
1269 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1270 			 struct qdisc_walker *w);
1271 
1272 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1273 {
1274 	struct check_loop_arg	arg;
1275 
1276 	if (q->ops->cl_ops == NULL)
1277 		return 0;
1278 
1279 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1280 	arg.w.fn = check_loop_fn;
1281 	arg.depth = depth;
1282 	arg.p = p;
1283 	q->ops->cl_ops->walk(q, &arg.w);
1284 	return arg.w.stop ? -ELOOP : 0;
1285 }
1286 
1287 static int
1288 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1289 {
1290 	struct Qdisc *leaf;
1291 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1292 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1293 
1294 	leaf = cops->leaf(q, cl);
1295 	if (leaf) {
1296 		if (leaf == arg->p || arg->depth > 7)
1297 			return -ELOOP;
1298 		return check_loop(leaf, arg->p, arg->depth + 1);
1299 	}
1300 	return 0;
1301 }
1302 
1303 /*
1304  * Delete/get qdisc.
1305  */
1306 
1307 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1308 			struct netlink_ext_ack *extack)
1309 {
1310 	struct net *net = sock_net(skb->sk);
1311 	struct tcmsg *tcm = nlmsg_data(n);
1312 	struct nlattr *tca[TCA_MAX + 1];
1313 	struct net_device *dev;
1314 	u32 clid;
1315 	struct Qdisc *q = NULL;
1316 	struct Qdisc *p = NULL;
1317 	int err;
1318 
1319 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1320 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1321 		return -EPERM;
1322 
1323 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1324 	if (err < 0)
1325 		return err;
1326 
1327 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1328 	if (!dev)
1329 		return -ENODEV;
1330 
1331 	clid = tcm->tcm_parent;
1332 	if (clid) {
1333 		if (clid != TC_H_ROOT) {
1334 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1335 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1336 				if (!p) {
1337 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1338 					return -ENOENT;
1339 				}
1340 				q = qdisc_leaf(p, clid);
1341 			} else if (dev_ingress_queue(dev)) {
1342 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1343 			}
1344 		} else {
1345 			q = dev->qdisc;
1346 		}
1347 		if (!q) {
1348 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1349 			return -ENOENT;
1350 		}
1351 
1352 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1353 			NL_SET_ERR_MSG(extack, "Invalid handle");
1354 			return -EINVAL;
1355 		}
1356 	} else {
1357 		q = qdisc_lookup(dev, tcm->tcm_handle);
1358 		if (!q) {
1359 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1360 			return -ENOENT;
1361 		}
1362 	}
1363 
1364 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1365 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1366 		return -EINVAL;
1367 	}
1368 
1369 	if (n->nlmsg_type == RTM_DELQDISC) {
1370 		if (!clid) {
1371 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1372 			return -EINVAL;
1373 		}
1374 		if (q->handle == 0) {
1375 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1376 			return -ENOENT;
1377 		}
1378 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1379 		if (err != 0)
1380 			return err;
1381 	} else {
1382 		qdisc_notify(net, skb, n, clid, NULL, q);
1383 	}
1384 	return 0;
1385 }
1386 
/*
 * Create/change qdisc.
 *
 * Depending on the handles and netlink flags in the request this either
 * changes the parameters of an existing qdisc (qdisc_change()), grafts an
 * already existing qdisc under a new parent, or creates a new qdisc
 * (qdisc_create()) and grafts it.  Requires CAP_NET_ADMIN.
 *
 * Returns 0 on success or a negative errno.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* Re-entered when qdisc_create() reports -EAGAIN; everything derived
	 * from the message is recomputed from scratch.
	 */
replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* A parent was given: find the qdisc currently attached
		 * there (q) and, for a class parent, the parent qdisc (p).
		 */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* A handle was given that differs from what
				 * is attached (or nothing is attached).
				 */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				/* The handle names an existing qdisc:
				 * graft that one under the new parent.
				 */
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				/* Reference for the graft below. */
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		/* No parent: the qdisc can only be addressed by handle. */
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

	/* Create a new qdisc, then fall through to graft it. */
create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			/* tcm_parent is passed as the handle too, which
			 * makes qdisc_create() take its ingress branch.
			 */
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Queue choice: the parent's select_queue() if it has one,
		 * else the parent's own queue, else tx queue 0.
		 */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN: start over from the top. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		/* Graft failed: give up the qdisc we created or referenced. */
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1570 
1571 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1572 			      struct netlink_callback *cb,
1573 			      int *q_idx_p, int s_q_idx, bool recur,
1574 			      bool dump_invisible)
1575 {
1576 	int ret = 0, q_idx = *q_idx_p;
1577 	struct Qdisc *q;
1578 	int b;
1579 
1580 	if (!root)
1581 		return 0;
1582 
1583 	q = root;
1584 	if (q_idx < s_q_idx) {
1585 		q_idx++;
1586 	} else {
1587 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1588 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1589 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1590 				  RTM_NEWQDISC) <= 0)
1591 			goto done;
1592 		q_idx++;
1593 	}
1594 
1595 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1596 	 * itself has already been dumped.
1597 	 *
1598 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1599 	 * qdisc hashtable, we don't want to hit it again
1600 	 */
1601 	if (!qdisc_dev(root) || !recur)
1602 		goto out;
1603 
1604 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1605 		if (q_idx < s_q_idx) {
1606 			q_idx++;
1607 			continue;
1608 		}
1609 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1610 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1611 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1612 				  RTM_NEWQDISC) <= 0)
1613 			goto done;
1614 		q_idx++;
1615 	}
1616 
1617 out:
1618 	*q_idx_p = q_idx;
1619 	return ret;
1620 done:
1621 	ret = -1;
1622 	goto out;
1623 }
1624 
1625 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1626 {
1627 	struct net *net = sock_net(skb->sk);
1628 	int idx, q_idx;
1629 	int s_idx, s_q_idx;
1630 	struct net_device *dev;
1631 	const struct nlmsghdr *nlh = cb->nlh;
1632 	struct nlattr *tca[TCA_MAX + 1];
1633 	int err;
1634 
1635 	s_idx = cb->args[0];
1636 	s_q_idx = q_idx = cb->args[1];
1637 
1638 	idx = 0;
1639 	ASSERT_RTNL();
1640 
1641 	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1642 	if (err < 0)
1643 		return err;
1644 
1645 	for_each_netdev(net, dev) {
1646 		struct netdev_queue *dev_queue;
1647 
1648 		if (idx < s_idx)
1649 			goto cont;
1650 		if (idx > s_idx)
1651 			s_q_idx = 0;
1652 		q_idx = 0;
1653 
1654 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1655 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1656 			goto done;
1657 
1658 		dev_queue = dev_ingress_queue(dev);
1659 		if (dev_queue &&
1660 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1661 				       &q_idx, s_q_idx, false,
1662 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1663 			goto done;
1664 
1665 cont:
1666 		idx++;
1667 	}
1668 
1669 done:
1670 	cb->args[0] = idx;
1671 	cb->args[1] = q_idx;
1672 
1673 	return skb->len;
1674 }
1675 
1676 
1677 
1678 /************************************************
1679  *	Traffic classes manipulation.		*
1680  ************************************************/
1681 
1682 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1683 			  unsigned long cl,
1684 			  u32 portid, u32 seq, u16 flags, int event)
1685 {
1686 	struct tcmsg *tcm;
1687 	struct nlmsghdr  *nlh;
1688 	unsigned char *b = skb_tail_pointer(skb);
1689 	struct gnet_dump d;
1690 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1691 
1692 	cond_resched();
1693 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1694 	if (!nlh)
1695 		goto out_nlmsg_trim;
1696 	tcm = nlmsg_data(nlh);
1697 	tcm->tcm_family = AF_UNSPEC;
1698 	tcm->tcm__pad1 = 0;
1699 	tcm->tcm__pad2 = 0;
1700 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1701 	tcm->tcm_parent = q->handle;
1702 	tcm->tcm_handle = q->handle;
1703 	tcm->tcm_info = 0;
1704 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1705 		goto nla_put_failure;
1706 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1707 		goto nla_put_failure;
1708 
1709 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1710 					 NULL, &d, TCA_PAD) < 0)
1711 		goto nla_put_failure;
1712 
1713 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1714 		goto nla_put_failure;
1715 
1716 	if (gnet_stats_finish_copy(&d) < 0)
1717 		goto nla_put_failure;
1718 
1719 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1720 	return skb->len;
1721 
1722 out_nlmsg_trim:
1723 nla_put_failure:
1724 	nlmsg_trim(skb, b);
1725 	return -1;
1726 }
1727 
1728 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1729 			 struct nlmsghdr *n, struct Qdisc *q,
1730 			 unsigned long cl, int event)
1731 {
1732 	struct sk_buff *skb;
1733 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1734 
1735 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1736 	if (!skb)
1737 		return -ENOBUFS;
1738 
1739 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1740 		kfree_skb(skb);
1741 		return -EINVAL;
1742 	}
1743 
1744 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1745 			      n->nlmsg_flags & NLM_F_ECHO);
1746 }
1747 
1748 static int tclass_del_notify(struct net *net,
1749 			     const struct Qdisc_class_ops *cops,
1750 			     struct sk_buff *oskb, struct nlmsghdr *n,
1751 			     struct Qdisc *q, unsigned long cl)
1752 {
1753 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1754 	struct sk_buff *skb;
1755 	int err = 0;
1756 
1757 	if (!cops->delete)
1758 		return -EOPNOTSUPP;
1759 
1760 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1761 	if (!skb)
1762 		return -ENOBUFS;
1763 
1764 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1765 			   RTM_DELTCLASS) < 0) {
1766 		kfree_skb(skb);
1767 		return -EINVAL;
1768 	}
1769 
1770 	err = cops->delete(q, cl);
1771 	if (err) {
1772 		kfree_skb(skb);
1773 		return err;
1774 	}
1775 
1776 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1777 			      n->nlmsg_flags & NLM_F_ECHO);
1778 }
1779 
1780 #ifdef CONFIG_NET_CLS
1781 
/* Walker state for tcf_node_bind().  The tcf_walker must stay the first
 * member: the callback casts the walker pointer back to this struct.
 */
struct tcf_bind_args {
	struct tcf_walker w;
	u32 classid;		/* class id passed to ->bind_class() */
	unsigned long cl;	/* class value passed to ->bind_class() */
};
1787 
1788 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1789 {
1790 	struct tcf_bind_args *a = (void *)arg;
1791 
1792 	if (tp->ops->bind_class) {
1793 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1794 
1795 		sch_tree_lock(q);
1796 		tp->ops->bind_class(n, a->classid, a->cl);
1797 		sch_tree_unlock(q);
1798 	}
1799 	return 0;
1800 }
1801 
1802 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1803 			   unsigned long new_cl)
1804 {
1805 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1806 	struct tcf_block *block;
1807 	struct tcf_chain *chain;
1808 	unsigned long cl;
1809 
1810 	cl = cops->find(q, portid);
1811 	if (!cl)
1812 		return;
1813 	block = cops->tcf_block(q, cl, NULL);
1814 	if (!block)
1815 		return;
1816 	list_for_each_entry(chain, &block->chain_list, list) {
1817 		struct tcf_proto *tp;
1818 
1819 		for (tp = rtnl_dereference(chain->filter_chain);
1820 		     tp; tp = rtnl_dereference(tp->next)) {
1821 			struct tcf_bind_args arg = {};
1822 
1823 			arg.w.fn = tcf_node_bind;
1824 			arg.classid = clid;
1825 			arg.cl = new_cl;
1826 			tp->ops->walk(tp, &arg.w);
1827 		}
1828 	}
1829 }
1830 
1831 #else
1832 
/* Without CONFIG_NET_CLS there are no filters to re-bind: no-op. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1837 
1838 #endif
1839 
/* Handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS.
 *
 * Resolves the qdisc and class addressed by tcm_parent/tcm_handle, then
 * creates/changes the class through the qdisc's ->change(), deletes it, or
 * sends a notification describing it.  Anything but RTM_GETTCLASS needs
 * CAP_NET_ADMIN.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NOTE: despite its name, portid holds the parent class handle
	 * (tcm_parent), not a netlink port id.
	 */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class does not exist: only RTM_NEWTCLASS with NLM_F_CREATE
		 * may continue, to create it below.
		 */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the filters from the class by binding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create or change the class; ->change() reports the resulting
	 * class through new_cl.
	 */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		/* The notification result is deliberately not propagated. */
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
1975 
/* Walker state for qdisc_class_dump().  The qdisc_walker must stay the
 * first member: the callback casts the walker pointer back to this struct.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;	/* message buffer being filled */
	struct netlink_callback	*cb;	/* dump request being answered */
};
1981 
1982 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1983 			    struct qdisc_walker *arg)
1984 {
1985 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1986 
1987 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1988 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1989 			      RTM_NEWTCLASS);
1990 }
1991 
1992 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1993 				struct tcmsg *tcm, struct netlink_callback *cb,
1994 				int *t_p, int s_t)
1995 {
1996 	struct qdisc_dump_args arg;
1997 
1998 	if (tc_qdisc_dump_ignore(q, false) ||
1999 	    *t_p < s_t || !q->ops->cl_ops ||
2000 	    (tcm->tcm_parent &&
2001 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2002 		(*t_p)++;
2003 		return 0;
2004 	}
2005 	if (*t_p > s_t)
2006 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2007 	arg.w.fn = qdisc_class_dump;
2008 	arg.skb = skb;
2009 	arg.cb = cb;
2010 	arg.w.stop  = 0;
2011 	arg.w.skip = cb->args[1];
2012 	arg.w.count = 0;
2013 	q->ops->cl_ops->walk(q, &arg.w);
2014 	cb->args[1] = arg.w.count;
2015 	if (arg.w.stop)
2016 		return -1;
2017 	(*t_p)++;
2018 	return 0;
2019 }
2020 
2021 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2022 			       struct tcmsg *tcm, struct netlink_callback *cb,
2023 			       int *t_p, int s_t)
2024 {
2025 	struct Qdisc *q;
2026 	int b;
2027 
2028 	if (!root)
2029 		return 0;
2030 
2031 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2032 		return -1;
2033 
2034 	if (!qdisc_dev(root))
2035 		return 0;
2036 
2037 	if (tcm->tcm_parent) {
2038 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2039 		if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2040 			return -1;
2041 		return 0;
2042 	}
2043 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2044 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2045 			return -1;
2046 	}
2047 
2048 	return 0;
2049 }
2050 
2051 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2052 {
2053 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2054 	struct net *net = sock_net(skb->sk);
2055 	struct netdev_queue *dev_queue;
2056 	struct net_device *dev;
2057 	int t, s_t;
2058 
2059 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2060 		return 0;
2061 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2062 	if (!dev)
2063 		return 0;
2064 
2065 	s_t = cb->args[0];
2066 	t = 0;
2067 
2068 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2069 		goto done;
2070 
2071 	dev_queue = dev_ingress_queue(dev);
2072 	if (dev_queue &&
2073 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2074 				&t, s_t) < 0)
2075 		goto done;
2076 
2077 done:
2078 	cb->args[0] = t;
2079 
2080 	dev_put(dev);
2081 	return skb->len;
2082 }
2083 
2084 #ifdef CONFIG_PROC_FS
2085 static int psched_show(struct seq_file *seq, void *v)
2086 {
2087 	seq_printf(seq, "%08x %08x %08x %08x\n",
2088 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2089 		   1000000,
2090 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2091 
2092 	return 0;
2093 }
2094 
2095 static int __net_init psched_net_init(struct net *net)
2096 {
2097 	struct proc_dir_entry *e;
2098 
2099 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2100 	if (e == NULL)
2101 		return -ENOMEM;
2102 
2103 	return 0;
2104 }
2105 
/* Per-namespace exit: remove the "psched" proc entry again. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2110 #else
/* Without CONFIG_PROC_FS there is no proc entry to create. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2115 
/* Without CONFIG_PROC_FS there is no proc entry to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2119 #endif
2120 
/* Per network namespace hooks, registered by pktsched_init(). */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2125 
2126 static int __init pktsched_init(void)
2127 {
2128 	int err;
2129 
2130 	err = register_pernet_subsys(&psched_net_ops);
2131 	if (err) {
2132 		pr_err("pktsched_init: "
2133 		       "cannot initialize per netns operations\n");
2134 		return err;
2135 	}
2136 
2137 	register_qdisc(&pfifo_fast_ops);
2138 	register_qdisc(&pfifo_qdisc_ops);
2139 	register_qdisc(&bfifo_qdisc_ops);
2140 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2141 	register_qdisc(&mq_qdisc_ops);
2142 	register_qdisc(&noqueue_qdisc_ops);
2143 
2144 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2145 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2146 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2147 		      0);
2148 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2149 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2150 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2151 		      0);
2152 
2153 	return 0;
2154 }
2155 
2156 subsys_initcall(pktsched_init);
2157