xref: /openbmc/linux/net/sched/sch_api.c (revision f220d3eb)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39 
40 /*
41 
42    Short review.
43    -------------
44 
45    This file consists of two interrelated parts:
46 
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49 
   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when the
   device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.
54 
   Qdiscs are divided into two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets to "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59 
60    In turn, classes may have child qdiscs (as rule, queues)
61    attached to them etc. etc. etc.
62 
   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work that is common to all
   qdiscs, and to provide rtnetlink notifications.
68 
69    All real intelligent work is done inside qdisc modules.
70 
71 
72 
73    Every discipline has two major routines: enqueue and dequeue.
74 
75    ---dequeue
76 
   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   a real packet queue; however, q->q.qlen must still be valid.
83 
84    ---enqueue
85 
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
89    NET_XMIT_DROP 	- this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93 
94    Auxiliary routines:
95 
96    ---peek
97 
98    like dequeue but without removing a packet from the queue
99 
100    ---reset
101 
102    returns qdisc to initial state: purge all buffers, clear all
103    timers, counters (except for statistics) etc.
104 
105    ---init
106 
107    initializes newly created qdisc.
108 
109    ---destroy
110 
111    destroys resources allocated by init and during lifetime of qdisc.
112 
113    ---change
114 
115    changes qdisc parameters.
116  */
117 
/* Protects the list of registered qdisc ops (qdisc_base) and updates of the
 * default qdisc ops pointer. It is a pure SMP lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
120 
121 
122 /************************************************
123  *	Queueing disciplines manipulation.	*
124  ************************************************/
125 
126 
/* Head of the singly linked list (chained through ->next) of all registered
 * queueing discipline ops. Modified with qdisc_mod_lock held for writing.
 */

static struct Qdisc_ops *qdisc_base;
130 
131 /* Register/unregister queueing discipline */
132 
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135 	struct Qdisc_ops *q, **qp;
136 	int rc = -EEXIST;
137 
138 	write_lock(&qdisc_mod_lock);
139 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140 		if (!strcmp(qops->id, q->id))
141 			goto out;
142 
143 	if (qops->enqueue == NULL)
144 		qops->enqueue = noop_qdisc_ops.enqueue;
145 	if (qops->peek == NULL) {
146 		if (qops->dequeue == NULL)
147 			qops->peek = noop_qdisc_ops.peek;
148 		else
149 			goto out_einval;
150 	}
151 	if (qops->dequeue == NULL)
152 		qops->dequeue = noop_qdisc_ops.dequeue;
153 
154 	if (qops->cl_ops) {
155 		const struct Qdisc_class_ops *cops = qops->cl_ops;
156 
157 		if (!(cops->find && cops->walk && cops->leaf))
158 			goto out_einval;
159 
160 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161 			goto out_einval;
162 	}
163 
164 	qops->next = NULL;
165 	*qp = qops;
166 	rc = 0;
167 out:
168 	write_unlock(&qdisc_mod_lock);
169 	return rc;
170 
171 out_einval:
172 	rc = -EINVAL;
173 	goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176 
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179 	struct Qdisc_ops *q, **qp;
180 	int err = -ENOENT;
181 
182 	write_lock(&qdisc_mod_lock);
183 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184 		if (q == qops)
185 			break;
186 	if (q) {
187 		*qp = q->next;
188 		q->next = NULL;
189 		err = 0;
190 	}
191 	write_unlock(&qdisc_mod_lock);
192 	return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195 
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
198 {
199 	read_lock(&qdisc_mod_lock);
200 	strlcpy(name, default_qdisc_ops->id, len);
201 	read_unlock(&qdisc_mod_lock);
202 }
203 
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206 	struct Qdisc_ops *q = NULL;
207 
208 	for (q = qdisc_base; q; q = q->next) {
209 		if (!strcmp(name, q->id)) {
210 			if (!try_module_get(q->owner))
211 				q = NULL;
212 			break;
213 		}
214 	}
215 
216 	return q;
217 }
218 
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222 	const struct Qdisc_ops *ops;
223 
224 	if (!capable(CAP_NET_ADMIN))
225 		return -EPERM;
226 
227 	write_lock(&qdisc_mod_lock);
228 	ops = qdisc_lookup_default(name);
229 	if (!ops) {
230 		/* Not found, drop lock and try to load module */
231 		write_unlock(&qdisc_mod_lock);
232 		request_module("sch_%s", name);
233 		write_lock(&qdisc_mod_lock);
234 
235 		ops = qdisc_lookup_default(name);
236 	}
237 
238 	if (ops) {
239 		/* Set new default */
240 		module_put(default_qdisc_ops->owner);
241 		default_qdisc_ops = ops;
242 	}
243 	write_unlock(&qdisc_mod_lock);
244 
245 	return ops ? 0 : -ENOENT;
246 }
247 
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named by the kernel config. */
static int __init sch_default_qdisc(void)
{
	const char *name = CONFIG_DEFAULT_NET_SCH;

	return qdisc_set_default(name);
}
late_initcall(sch_default_qdisc);
#endif
256 
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261 
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264 	struct Qdisc *q;
265 
266 	if (!qdisc_dev(root))
267 		return (root->handle == handle ? root : NULL);
268 
269 	if (!(root->flags & TCQ_F_BUILTIN) &&
270 	    root->handle == handle)
271 		return root;
272 
273 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274 		if (q->handle == handle)
275 			return q;
276 	}
277 	return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 		ASSERT_RTNL();
284 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 		if (invisible)
286 			q->flags |= TCQ_F_INVISIBLE;
287 	}
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 		ASSERT_RTNL();
295 		hash_del_rcu(&q->hash);
296 	}
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 	struct Qdisc *q;
303 
304 	if (!handle)
305 		return NULL;
306 	q = qdisc_match_from_root(dev->qdisc, handle);
307 	if (q)
308 		goto out;
309 
310 	if (dev_ingress_queue(dev))
311 		q = qdisc_match_from_root(
312 			dev_ingress_queue(dev)->qdisc_sleeping,
313 			handle);
314 out:
315 	return q;
316 }
317 
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320 	unsigned long cl;
321 	struct Qdisc *leaf;
322 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323 
324 	if (cops == NULL)
325 		return NULL;
326 	cl = cops->find(p, classid);
327 
328 	if (cl == 0)
329 		return NULL;
330 	leaf = cops->leaf(p, cl);
331 	return leaf;
332 }
333 
334 /* Find queueing discipline by name */
335 
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338 	struct Qdisc_ops *q = NULL;
339 
340 	if (kind) {
341 		read_lock(&qdisc_mod_lock);
342 		for (q = qdisc_base; q; q = q->next) {
343 			if (nla_strcmp(kind, q->id) == 0) {
344 				if (!try_module_get(q->owner))
345 					q = NULL;
346 				break;
347 			}
348 		}
349 		read_unlock(&qdisc_mod_lock);
350 	}
351 	return q;
352 }
353 
354 /* The linklayer setting were not transferred from iproute2, in older
355  * versions, and the rate tables lookup systems have been dropped in
356  * the kernel. To keep backward compatible with older iproute2 tc
357  * utils, we detect the linklayer setting by detecting if the rate
358  * table were modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find low and high table entries for
364  * mapping this cell.  If these entries contain the same value, when
365  * the rate tables have been modified for linklayer ATM.
366  *
367  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368  * and then roundup to the next cell, calc the table entry one below,
369  * and compare.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373 	int low       = roundup(r->mpu, 48);
374 	int high      = roundup(low+1, 48);
375 	int cell_low  = low >> r->cell_log;
376 	int cell_high = (high >> r->cell_log) - 1;
377 
378 	/* rtab is too inaccurate at rates > 100Mbit/s */
379 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380 		pr_debug("TC linklayer: Giving up ATM detection\n");
381 		return TC_LINKLAYER_ETHERNET;
382 	}
383 
384 	if ((cell_high > cell_low) && (cell_high < 256)
385 	    && (rtab[cell_low] == rtab[cell_high])) {
386 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387 			 cell_low, cell_high, rtab[cell_high]);
388 		return TC_LINKLAYER_ATM;
389 	}
390 	return TC_LINKLAYER_ETHERNET;
391 }
392 
/* Singly linked list (via ->next) of reference-counted rate tables handed
 * out by qdisc_get_rtab() and released through qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
394 
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396 					struct nlattr *tab,
397 					struct netlink_ext_ack *extack)
398 {
399 	struct qdisc_rate_table *rtab;
400 
401 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402 	    nla_len(tab) != TC_RTAB_SIZE) {
403 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404 		return NULL;
405 	}
406 
407 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
409 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
410 			rtab->refcnt++;
411 			return rtab;
412 		}
413 	}
414 
415 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
416 	if (rtab) {
417 		rtab->rate = *r;
418 		rtab->refcnt = 1;
419 		memcpy(rtab->data, nla_data(tab), 1024);
420 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
421 			r->linklayer = __detect_linklayer(r, rtab->data);
422 		rtab->next = qdisc_rtab_list;
423 		qdisc_rtab_list = rtab;
424 	} else {
425 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
426 	}
427 	return rtab;
428 }
429 EXPORT_SYMBOL(qdisc_get_rtab);
430 
431 void qdisc_put_rtab(struct qdisc_rate_table *tab)
432 {
433 	struct qdisc_rate_table *rtab, **rtabp;
434 
435 	if (!tab || --tab->refcnt)
436 		return;
437 
438 	for (rtabp = &qdisc_rtab_list;
439 	     (rtab = *rtabp) != NULL;
440 	     rtabp = &rtab->next) {
441 		if (rtab == tab) {
442 			*rtabp = rtab->next;
443 			kfree(rtab);
444 			return;
445 		}
446 	}
447 }
448 EXPORT_SYMBOL(qdisc_put_rtab);
449 
/* List of reference-counted size tables shared between qdiscs; entries are
 * added by qdisc_get_stab() and removed by qdisc_put_stab().
 */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for the attributes nested inside TCA_STAB. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
456 
457 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
458 					       struct netlink_ext_ack *extack)
459 {
460 	struct nlattr *tb[TCA_STAB_MAX + 1];
461 	struct qdisc_size_table *stab;
462 	struct tc_sizespec *s;
463 	unsigned int tsize = 0;
464 	u16 *tab = NULL;
465 	int err;
466 
467 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468 	if (err < 0)
469 		return ERR_PTR(err);
470 	if (!tb[TCA_STAB_BASE]) {
471 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472 		return ERR_PTR(-EINVAL);
473 	}
474 
475 	s = nla_data(tb[TCA_STAB_BASE]);
476 
477 	if (s->tsize > 0) {
478 		if (!tb[TCA_STAB_DATA]) {
479 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480 			return ERR_PTR(-EINVAL);
481 		}
482 		tab = nla_data(tb[TCA_STAB_DATA]);
483 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
484 	}
485 
486 	if (tsize != s->tsize || (!tab && tsize > 0)) {
487 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
488 		return ERR_PTR(-EINVAL);
489 	}
490 
491 	list_for_each_entry(stab, &qdisc_stab_list, list) {
492 		if (memcmp(&stab->szopts, s, sizeof(*s)))
493 			continue;
494 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
495 			continue;
496 		stab->refcnt++;
497 		return stab;
498 	}
499 
500 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
501 	if (!stab)
502 		return ERR_PTR(-ENOMEM);
503 
504 	stab->refcnt = 1;
505 	stab->szopts = *s;
506 	if (tsize > 0)
507 		memcpy(stab->data, tab, tsize * sizeof(u16));
508 
509 	list_add_tail(&stab->list, &qdisc_stab_list);
510 
511 	return stab;
512 }
513 
514 static void stab_kfree_rcu(struct rcu_head *head)
515 {
516 	kfree(container_of(head, struct qdisc_size_table, rcu));
517 }
518 
519 void qdisc_put_stab(struct qdisc_size_table *tab)
520 {
521 	if (!tab)
522 		return;
523 
524 	if (--tab->refcnt == 0) {
525 		list_del(&tab->list);
526 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
527 	}
528 }
529 EXPORT_SYMBOL(qdisc_put_stab);
530 
531 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
532 {
533 	struct nlattr *nest;
534 
535 	nest = nla_nest_start(skb, TCA_STAB);
536 	if (nest == NULL)
537 		goto nla_put_failure;
538 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
539 		goto nla_put_failure;
540 	nla_nest_end(skb, nest);
541 
542 	return skb->len;
543 
544 nla_put_failure:
545 	return -1;
546 }
547 
548 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
549 			       const struct qdisc_size_table *stab)
550 {
551 	int pkt_len, slot;
552 
553 	pkt_len = skb->len + stab->szopts.overhead;
554 	if (unlikely(!stab->szopts.tsize))
555 		goto out;
556 
557 	slot = pkt_len + stab->szopts.cell_align;
558 	if (unlikely(slot < 0))
559 		slot = 0;
560 
561 	slot >>= stab->szopts.cell_log;
562 	if (likely(slot < stab->szopts.tsize))
563 		pkt_len = stab->data[slot];
564 	else
565 		pkt_len = stab->data[stab->szopts.tsize - 1] *
566 				(slot / stab->szopts.tsize) +
567 				stab->data[slot % stab->szopts.tsize];
568 
569 	pkt_len <<= stab->szopts.size_log;
570 out:
571 	if (unlikely(pkt_len < 1))
572 		pkt_len = 1;
573 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
574 }
575 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
576 
577 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
578 {
579 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
580 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
581 			txt, qdisc->ops->id, qdisc->handle >> 16);
582 		qdisc->flags |= TCQ_F_WARN_NONWC;
583 	}
584 }
585 EXPORT_SYMBOL(qdisc_warn_nonwc);
586 
587 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
588 {
589 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590 						 timer);
591 
592 	rcu_read_lock();
593 	__netif_schedule(qdisc_root(wd->qdisc));
594 	rcu_read_unlock();
595 
596 	return HRTIMER_NORESTART;
597 }
598 
599 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
600 				 clockid_t clockid)
601 {
602 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
603 	wd->timer.function = qdisc_watchdog;
604 	wd->qdisc = qdisc;
605 }
606 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
607 
/* Initialise watchdog @wd for @qdisc using CLOCK_MONOTONIC; see
 * qdisc_watchdog_init_clockid().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
613 
614 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
615 {
616 	if (test_bit(__QDISC_STATE_DEACTIVATED,
617 		     &qdisc_root_sleeping(wd->qdisc)->state))
618 		return;
619 
620 	if (wd->last_expires == expires)
621 		return;
622 
623 	wd->last_expires = expires;
624 	hrtimer_start(&wd->timer,
625 		      ns_to_ktime(expires),
626 		      HRTIMER_MODE_ABS_PINNED);
627 }
628 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
629 
/* Cancel the watchdog's hrtimer via hrtimer_cancel(). */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
635 
636 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
637 {
638 	struct hlist_head *h;
639 	unsigned int i;
640 
641 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
642 
643 	if (h != NULL) {
644 		for (i = 0; i < n; i++)
645 			INIT_HLIST_HEAD(&h[i]);
646 	}
647 	return h;
648 }
649 
650 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
651 {
652 	struct Qdisc_class_common *cl;
653 	struct hlist_node *next;
654 	struct hlist_head *nhash, *ohash;
655 	unsigned int nsize, nmask, osize;
656 	unsigned int i, h;
657 
658 	/* Rehash when load factor exceeds 0.75 */
659 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
660 		return;
661 	nsize = clhash->hashsize * 2;
662 	nmask = nsize - 1;
663 	nhash = qdisc_class_hash_alloc(nsize);
664 	if (nhash == NULL)
665 		return;
666 
667 	ohash = clhash->hash;
668 	osize = clhash->hashsize;
669 
670 	sch_tree_lock(sch);
671 	for (i = 0; i < osize; i++) {
672 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
673 			h = qdisc_class_hash(cl->classid, nmask);
674 			hlist_add_head(&cl->hnode, &nhash[h]);
675 		}
676 	}
677 	clhash->hash     = nhash;
678 	clhash->hashsize = nsize;
679 	clhash->hashmask = nmask;
680 	sch_tree_unlock(sch);
681 
682 	kvfree(ohash);
683 }
684 EXPORT_SYMBOL(qdisc_class_hash_grow);
685 
686 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
687 {
688 	unsigned int size = 4;
689 
690 	clhash->hash = qdisc_class_hash_alloc(size);
691 	if (!clhash->hash)
692 		return -ENOMEM;
693 	clhash->hashsize  = size;
694 	clhash->hashmask  = size - 1;
695 	clhash->hashelems = 0;
696 	return 0;
697 }
698 EXPORT_SYMBOL(qdisc_class_hash_init);
699 
/* Free the bucket array of @clhash. The classes that were linked into it
 * are not touched here.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
705 
706 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
707 			     struct Qdisc_class_common *cl)
708 {
709 	unsigned int h;
710 
711 	INIT_HLIST_NODE(&cl->hnode);
712 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
713 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
714 	clhash->hashelems++;
715 }
716 EXPORT_SYMBOL(qdisc_class_hash_insert);
717 
718 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
719 			     struct Qdisc_class_common *cl)
720 {
721 	hlist_del(&cl->hnode);
722 	clhash->hashelems--;
723 }
724 EXPORT_SYMBOL(qdisc_class_hash_remove);
725 
726 /* Allocate an unique handle from space managed by kernel
727  * Possible range is [8000-FFFF]:0000 (0x8000 values)
728  */
729 static u32 qdisc_alloc_handle(struct net_device *dev)
730 {
731 	int i = 0x8000;
732 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
733 
734 	do {
735 		autohandle += TC_H_MAKE(0x10000U, 0);
736 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
737 			autohandle = TC_H_MAKE(0x80000000U, 0);
738 		if (!qdisc_lookup(dev, autohandle))
739 			return autohandle;
740 		cond_resched();
741 	} while	(--i > 0);
742 
743 	return 0;
744 }
745 
746 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
747 			       unsigned int len)
748 {
749 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
750 	const struct Qdisc_class_ops *cops;
751 	unsigned long cl;
752 	u32 parentid;
753 	bool notify;
754 	int drops;
755 
756 	if (n == 0 && len == 0)
757 		return;
758 	drops = max_t(int, n, 0);
759 	rcu_read_lock();
760 	while ((parentid = sch->parent)) {
761 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
762 			break;
763 
764 		if (sch->flags & TCQ_F_NOPARENT)
765 			break;
766 		/* Notify parent qdisc only if child qdisc becomes empty.
767 		 *
768 		 * If child was empty even before update then backlog
769 		 * counter is screwed and we skip notification because
770 		 * parent class is already passive.
771 		 *
772 		 * If the original child was offloaded then it is allowed
773 		 * to be seem as empty, so the parent is notified anyway.
774 		 */
775 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
776 						       !qdisc_is_offloaded);
777 		/* TODO: perform the search on a per txq basis */
778 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
779 		if (sch == NULL) {
780 			WARN_ON_ONCE(parentid != TC_H_ROOT);
781 			break;
782 		}
783 		cops = sch->ops->cl_ops;
784 		if (notify && cops->qlen_notify) {
785 			cl = cops->find(sch, parentid);
786 			cops->qlen_notify(sch, cl);
787 		}
788 		sch->q.qlen -= n;
789 		sch->qstats.backlog -= len;
790 		__qdisc_qstats_drop(sch, drops);
791 	}
792 	rcu_read_unlock();
793 }
794 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
795 
/* Append one netlink message describing qdisc @q to @skb.
 *
 * @clid is reported as the parent; @portid, @seq, @flags and @event go into
 * the netlink header. The message carries the tcmsg header, TCA_KIND, the
 * optional ingress/egress block indexes, the qdisc's own dump output,
 * TCA_HW_OFFLOAD, the size table (if any) and statistics.
 *
 * Returns skb->len on success. On any failure the partially written message
 * is trimmed off @skb again and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Remember where this message starts so it can be trimmed on error. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the qdisc's current reference count. */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Block indexes are only reported when the qdisc returns non-zero. */
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	/* Per-CPU counters are passed along only if the qdisc uses them. */
	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Final message length: everything written since b was recorded. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
874 
875 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
876 {
877 	if (q->flags & TCQ_F_BUILTIN)
878 		return true;
879 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
880 		return true;
881 
882 	return false;
883 }
884 
885 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
886 			struct nlmsghdr *n, u32 clid,
887 			struct Qdisc *old, struct Qdisc *new)
888 {
889 	struct sk_buff *skb;
890 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
891 
892 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
893 	if (!skb)
894 		return -ENOBUFS;
895 
896 	if (old && !tc_qdisc_dump_ignore(old, false)) {
897 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
898 				  0, RTM_DELQDISC) < 0)
899 			goto err_out;
900 	}
901 	if (new && !tc_qdisc_dump_ignore(new, false)) {
902 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
903 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
904 			goto err_out;
905 	}
906 
907 	if (skb->len)
908 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
909 				      n->nlmsg_flags & NLM_F_ECHO);
910 
911 err_out:
912 	kfree_skb(skb);
913 	return -EINVAL;
914 }
915 
916 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
917 			       struct nlmsghdr *n, u32 clid,
918 			       struct Qdisc *old, struct Qdisc *new)
919 {
920 	if (new || old)
921 		qdisc_notify(net, skb, n, clid, old, new);
922 
923 	if (old)
924 		qdisc_destroy(old);
925 }
926 
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev" (when "parent" is NULL).
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * Returns 0 on success. Device-level grafting fails with -ENOENT when an
 * ingress qdisc is involved but the device has no ingress queue. Class-level
 * grafting returns -EOPNOTSUPP if the parent has no graft operation,
 * -ENOENT if the class is not found, or whatever the graft operation
 * returns.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* An ingress qdisc lives on the single ingress queue only. */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* A running device is deactivated while qdiscs are swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Qdiscs with an attach op are not grafted per queue here;
		 * their attach callback is invoked below instead.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional queue using new. */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			/* Egress: the previous root is dev->qdisc. */
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old,
						  extack);
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		/* Notify and destroy the old qdisc only after a successful graft. */
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
1021 
1022 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1023 				   struct netlink_ext_ack *extack)
1024 {
1025 	u32 block_index;
1026 
1027 	if (tca[TCA_INGRESS_BLOCK]) {
1028 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1029 
1030 		if (!block_index) {
1031 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1032 			return -EINVAL;
1033 		}
1034 		if (!sch->ops->ingress_block_set) {
1035 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1036 			return -EOPNOTSUPP;
1037 		}
1038 		sch->ops->ingress_block_set(sch, block_index);
1039 	}
1040 	if (tca[TCA_EGRESS_BLOCK]) {
1041 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1042 
1043 		if (!block_index) {
1044 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1045 			return -EINVAL;
1046 		}
1047 		if (!sch->ops->egress_block_set) {
1048 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1049 			return -EOPNOTSUPP;
1050 		}
1051 		sch->ops->egress_block_set(sch, block_index);
1052 	}
1053 	return 0;
1054 }
1055 
/* Lockdep classes assigned to qdisc locks in qdisc_create(): ingress qdiscs
 * get qdisc_rx_lock, all others qdisc_tx_lock. The annotation is needed for
 * ingress; egress gets it only for the name.
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
1059 
1060 /*
1061    Allocate and initialize new qdisc.
1062 
1063    Parameters are passed via opt.
1064  */
1065 
1066 static struct Qdisc *qdisc_create(struct net_device *dev,
1067 				  struct netdev_queue *dev_queue,
1068 				  struct Qdisc *p, u32 parent, u32 handle,
1069 				  struct nlattr **tca, int *errp,
1070 				  struct netlink_ext_ack *extack)
1071 {
1072 	int err;
1073 	struct nlattr *kind = tca[TCA_KIND];
1074 	struct Qdisc *sch;
1075 	struct Qdisc_ops *ops;
1076 	struct qdisc_size_table *stab;
1077 
1078 	ops = qdisc_lookup_ops(kind);
1079 #ifdef CONFIG_MODULES
1080 	if (ops == NULL && kind != NULL) {
1081 		char name[IFNAMSIZ];
1082 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1083 			/* We dropped the RTNL semaphore in order to
1084 			 * perform the module load.  So, even if we
1085 			 * succeeded in loading the module we have to
1086 			 * tell the caller to replay the request.  We
1087 			 * indicate this using -EAGAIN.
1088 			 * We replay the request because the device may
1089 			 * go away in the mean time.
1090 			 */
1091 			rtnl_unlock();
1092 			request_module("sch_%s", name);
1093 			rtnl_lock();
1094 			ops = qdisc_lookup_ops(kind);
1095 			if (ops != NULL) {
1096 				/* We will try again qdisc_lookup_ops,
1097 				 * so don't keep a reference.
1098 				 */
1099 				module_put(ops->owner);
1100 				err = -EAGAIN;
1101 				goto err_out;
1102 			}
1103 		}
1104 	}
1105 #endif
1106 
1107 	err = -ENOENT;
1108 	if (!ops) {
1109 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1110 		goto err_out;
1111 	}
1112 
1113 	sch = qdisc_alloc(dev_queue, ops, extack);
1114 	if (IS_ERR(sch)) {
1115 		err = PTR_ERR(sch);
1116 		goto err_out2;
1117 	}
1118 
1119 	sch->parent = parent;
1120 
1121 	if (handle == TC_H_INGRESS) {
1122 		sch->flags |= TCQ_F_INGRESS;
1123 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1124 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1125 	} else {
1126 		if (handle == 0) {
1127 			handle = qdisc_alloc_handle(dev);
1128 			err = -ENOMEM;
1129 			if (handle == 0)
1130 				goto err_out3;
1131 		}
1132 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1133 		if (!netif_is_multiqueue(dev))
1134 			sch->flags |= TCQ_F_ONETXQUEUE;
1135 	}
1136 
1137 	sch->handle = handle;
1138 
1139 	/* This exist to keep backward compatible with a userspace
1140 	 * loophole, what allowed userspace to get IFF_NO_QUEUE
1141 	 * facility on older kernels by setting tx_queue_len=0 (prior
1142 	 * to qdisc init), and then forgot to reinit tx_queue_len
1143 	 * before again attaching a qdisc.
1144 	 */
1145 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1146 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1147 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1148 	}
1149 
1150 	err = qdisc_block_indexes_set(sch, tca, extack);
1151 	if (err)
1152 		goto err_out3;
1153 
1154 	if (ops->init) {
1155 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1156 		if (err != 0)
1157 			goto err_out5;
1158 	}
1159 
1160 	if (tca[TCA_STAB]) {
1161 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1162 		if (IS_ERR(stab)) {
1163 			err = PTR_ERR(stab);
1164 			goto err_out4;
1165 		}
1166 		rcu_assign_pointer(sch->stab, stab);
1167 	}
1168 	if (tca[TCA_RATE]) {
1169 		seqcount_t *running;
1170 
1171 		err = -EOPNOTSUPP;
1172 		if (sch->flags & TCQ_F_MQROOT) {
1173 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1174 			goto err_out4;
1175 		}
1176 
1177 		if (sch->parent != TC_H_ROOT &&
1178 		    !(sch->flags & TCQ_F_INGRESS) &&
1179 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1180 			running = qdisc_root_sleeping_running(sch);
1181 		else
1182 			running = &sch->running;
1183 
1184 		err = gen_new_estimator(&sch->bstats,
1185 					sch->cpu_bstats,
1186 					&sch->rate_est,
1187 					NULL,
1188 					running,
1189 					tca[TCA_RATE]);
1190 		if (err) {
1191 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1192 			goto err_out4;
1193 		}
1194 	}
1195 
1196 	qdisc_hash_add(sch, false);
1197 
1198 	return sch;
1199 
1200 err_out5:
1201 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1202 	if (ops->destroy)
1203 		ops->destroy(sch);
1204 err_out3:
1205 	dev_put(dev);
1206 	qdisc_free(sch);
1207 err_out2:
1208 	module_put(ops->owner);
1209 err_out:
1210 	*errp = err;
1211 	return NULL;
1212 
1213 err_out4:
1214 	/*
1215 	 * Any broken qdiscs that would require a ops->reset() here?
1216 	 * The qdisc was never in action so it shouldn't be necessary.
1217 	 */
1218 	qdisc_put_stab(rtnl_dereference(sch->stab));
1219 	if (ops->destroy)
1220 		ops->destroy(sch);
1221 	goto err_out3;
1222 }
1223 
1224 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1225 			struct netlink_ext_ack *extack)
1226 {
1227 	struct qdisc_size_table *ostab, *stab = NULL;
1228 	int err = 0;
1229 
1230 	if (tca[TCA_OPTIONS]) {
1231 		if (!sch->ops->change) {
1232 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1233 			return -EINVAL;
1234 		}
1235 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1236 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1237 			return -EOPNOTSUPP;
1238 		}
1239 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1240 		if (err)
1241 			return err;
1242 	}
1243 
1244 	if (tca[TCA_STAB]) {
1245 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1246 		if (IS_ERR(stab))
1247 			return PTR_ERR(stab);
1248 	}
1249 
1250 	ostab = rtnl_dereference(sch->stab);
1251 	rcu_assign_pointer(sch->stab, stab);
1252 	qdisc_put_stab(ostab);
1253 
1254 	if (tca[TCA_RATE]) {
1255 		/* NB: ignores errors from replace_estimator
1256 		   because change can't be undone. */
1257 		if (sch->flags & TCQ_F_MQROOT)
1258 			goto out;
1259 		gen_replace_estimator(&sch->bstats,
1260 				      sch->cpu_bstats,
1261 				      &sch->rate_est,
1262 				      NULL,
1263 				      qdisc_root_sleeping_running(sch),
1264 				      tca[TCA_RATE]);
1265 	}
1266 out:
1267 	return 0;
1268 }
1269 
1270 struct check_loop_arg {
1271 	struct qdisc_walker	w;
1272 	struct Qdisc		*p;
1273 	int			depth;
1274 };
1275 
1276 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1277 			 struct qdisc_walker *w);
1278 
1279 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1280 {
1281 	struct check_loop_arg	arg;
1282 
1283 	if (q->ops->cl_ops == NULL)
1284 		return 0;
1285 
1286 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1287 	arg.w.fn = check_loop_fn;
1288 	arg.depth = depth;
1289 	arg.p = p;
1290 	q->ops->cl_ops->walk(q, &arg.w);
1291 	return arg.w.stop ? -ELOOP : 0;
1292 }
1293 
1294 static int
1295 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1296 {
1297 	struct Qdisc *leaf;
1298 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1299 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1300 
1301 	leaf = cops->leaf(q, cl);
1302 	if (leaf) {
1303 		if (leaf == arg->p || arg->depth > 7)
1304 			return -ELOOP;
1305 		return check_loop(leaf, arg->p, arg->depth + 1);
1306 	}
1307 	return 0;
1308 }
1309 
1310 /*
1311  * Delete/get qdisc.
1312  */
1313 
1314 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1315 			struct netlink_ext_ack *extack)
1316 {
1317 	struct net *net = sock_net(skb->sk);
1318 	struct tcmsg *tcm = nlmsg_data(n);
1319 	struct nlattr *tca[TCA_MAX + 1];
1320 	struct net_device *dev;
1321 	u32 clid;
1322 	struct Qdisc *q = NULL;
1323 	struct Qdisc *p = NULL;
1324 	int err;
1325 
1326 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1327 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1328 		return -EPERM;
1329 
1330 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1331 	if (err < 0)
1332 		return err;
1333 
1334 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1335 	if (!dev)
1336 		return -ENODEV;
1337 
1338 	clid = tcm->tcm_parent;
1339 	if (clid) {
1340 		if (clid != TC_H_ROOT) {
1341 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1342 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1343 				if (!p) {
1344 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1345 					return -ENOENT;
1346 				}
1347 				q = qdisc_leaf(p, clid);
1348 			} else if (dev_ingress_queue(dev)) {
1349 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1350 			}
1351 		} else {
1352 			q = dev->qdisc;
1353 		}
1354 		if (!q) {
1355 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1356 			return -ENOENT;
1357 		}
1358 
1359 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1360 			NL_SET_ERR_MSG(extack, "Invalid handle");
1361 			return -EINVAL;
1362 		}
1363 	} else {
1364 		q = qdisc_lookup(dev, tcm->tcm_handle);
1365 		if (!q) {
1366 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1367 			return -ENOENT;
1368 		}
1369 	}
1370 
1371 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1372 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1373 		return -EINVAL;
1374 	}
1375 
1376 	if (n->nlmsg_type == RTM_DELQDISC) {
1377 		if (!clid) {
1378 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1379 			return -EINVAL;
1380 		}
1381 		if (q->handle == 0) {
1382 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1383 			return -ENOENT;
1384 		}
1385 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1386 		if (err != 0)
1387 			return err;
1388 	} else {
1389 		qdisc_notify(net, skb, n, clid, NULL, q);
1390 	}
1391 	return 0;
1392 }
1393 
1394 /*
1395  * Create/change qdisc.
1396  */
1397 
1398 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1399 			   struct netlink_ext_ack *extack)
1400 {
1401 	struct net *net = sock_net(skb->sk);
1402 	struct tcmsg *tcm;
1403 	struct nlattr *tca[TCA_MAX + 1];
1404 	struct net_device *dev;
1405 	u32 clid;
1406 	struct Qdisc *q, *p;
1407 	int err;
1408 
1409 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1410 		return -EPERM;
1411 
1412 replay:
1413 	/* Reinit, just in case something touches this. */
1414 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1415 	if (err < 0)
1416 		return err;
1417 
1418 	tcm = nlmsg_data(n);
1419 	clid = tcm->tcm_parent;
1420 	q = p = NULL;
1421 
1422 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1423 	if (!dev)
1424 		return -ENODEV;
1425 
1426 
1427 	if (clid) {
1428 		if (clid != TC_H_ROOT) {
1429 			if (clid != TC_H_INGRESS) {
1430 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1431 				if (!p) {
1432 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1433 					return -ENOENT;
1434 				}
1435 				q = qdisc_leaf(p, clid);
1436 			} else if (dev_ingress_queue_create(dev)) {
1437 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1438 			}
1439 		} else {
1440 			q = dev->qdisc;
1441 		}
1442 
1443 		/* It may be default qdisc, ignore it */
1444 		if (q && q->handle == 0)
1445 			q = NULL;
1446 
1447 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1448 			if (tcm->tcm_handle) {
1449 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1450 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1451 					return -EEXIST;
1452 				}
1453 				if (TC_H_MIN(tcm->tcm_handle)) {
1454 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1455 					return -EINVAL;
1456 				}
1457 				q = qdisc_lookup(dev, tcm->tcm_handle);
1458 				if (!q)
1459 					goto create_n_graft;
1460 				if (n->nlmsg_flags & NLM_F_EXCL) {
1461 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1462 					return -EEXIST;
1463 				}
1464 				if (tca[TCA_KIND] &&
1465 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1466 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1467 					return -EINVAL;
1468 				}
1469 				if (q == p ||
1470 				    (p && check_loop(q, p, 0))) {
1471 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1472 					return -ELOOP;
1473 				}
1474 				qdisc_refcount_inc(q);
1475 				goto graft;
1476 			} else {
1477 				if (!q)
1478 					goto create_n_graft;
1479 
1480 				/* This magic test requires explanation.
1481 				 *
1482 				 *   We know, that some child q is already
1483 				 *   attached to this parent and have choice:
1484 				 *   either to change it or to create/graft new one.
1485 				 *
1486 				 *   1. We are allowed to create/graft only
1487 				 *   if CREATE and REPLACE flags are set.
1488 				 *
1489 				 *   2. If EXCL is set, requestor wanted to say,
1490 				 *   that qdisc tcm_handle is not expected
1491 				 *   to exist, so that we choose create/graft too.
1492 				 *
1493 				 *   3. The last case is when no flags are set.
1494 				 *   Alas, it is sort of hole in API, we
1495 				 *   cannot decide what to do unambiguously.
1496 				 *   For now we select create/graft, if
1497 				 *   user gave KIND, which does not match existing.
1498 				 */
1499 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1500 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1501 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1502 				     (tca[TCA_KIND] &&
1503 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1504 					goto create_n_graft;
1505 			}
1506 		}
1507 	} else {
1508 		if (!tcm->tcm_handle) {
1509 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1510 			return -EINVAL;
1511 		}
1512 		q = qdisc_lookup(dev, tcm->tcm_handle);
1513 	}
1514 
1515 	/* Change qdisc parameters */
1516 	if (!q) {
1517 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1518 		return -ENOENT;
1519 	}
1520 	if (n->nlmsg_flags & NLM_F_EXCL) {
1521 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1522 		return -EEXIST;
1523 	}
1524 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1525 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1526 		return -EINVAL;
1527 	}
1528 	err = qdisc_change(q, tca, extack);
1529 	if (err == 0)
1530 		qdisc_notify(net, skb, n, clid, NULL, q);
1531 	return err;
1532 
1533 create_n_graft:
1534 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1535 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1536 		return -ENOENT;
1537 	}
1538 	if (clid == TC_H_INGRESS) {
1539 		if (dev_ingress_queue(dev)) {
1540 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1541 					 tcm->tcm_parent, tcm->tcm_parent,
1542 					 tca, &err, extack);
1543 		} else {
1544 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1545 			err = -ENOENT;
1546 		}
1547 	} else {
1548 		struct netdev_queue *dev_queue;
1549 
1550 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1551 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1552 		else if (p)
1553 			dev_queue = p->dev_queue;
1554 		else
1555 			dev_queue = netdev_get_tx_queue(dev, 0);
1556 
1557 		q = qdisc_create(dev, dev_queue, p,
1558 				 tcm->tcm_parent, tcm->tcm_handle,
1559 				 tca, &err, extack);
1560 	}
1561 	if (q == NULL) {
1562 		if (err == -EAGAIN)
1563 			goto replay;
1564 		return err;
1565 	}
1566 
1567 graft:
1568 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1569 	if (err) {
1570 		if (q)
1571 			qdisc_destroy(q);
1572 		return err;
1573 	}
1574 
1575 	return 0;
1576 }
1577 
1578 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1579 			      struct netlink_callback *cb,
1580 			      int *q_idx_p, int s_q_idx, bool recur,
1581 			      bool dump_invisible)
1582 {
1583 	int ret = 0, q_idx = *q_idx_p;
1584 	struct Qdisc *q;
1585 	int b;
1586 
1587 	if (!root)
1588 		return 0;
1589 
1590 	q = root;
1591 	if (q_idx < s_q_idx) {
1592 		q_idx++;
1593 	} else {
1594 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1595 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1596 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1597 				  RTM_NEWQDISC) <= 0)
1598 			goto done;
1599 		q_idx++;
1600 	}
1601 
1602 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1603 	 * itself has already been dumped.
1604 	 *
1605 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1606 	 * qdisc hashtable, we don't want to hit it again
1607 	 */
1608 	if (!qdisc_dev(root) || !recur)
1609 		goto out;
1610 
1611 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1612 		if (q_idx < s_q_idx) {
1613 			q_idx++;
1614 			continue;
1615 		}
1616 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1617 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1618 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1619 				  RTM_NEWQDISC) <= 0)
1620 			goto done;
1621 		q_idx++;
1622 	}
1623 
1624 out:
1625 	*q_idx_p = q_idx;
1626 	return ret;
1627 done:
1628 	ret = -1;
1629 	goto out;
1630 }
1631 
1632 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1633 {
1634 	struct net *net = sock_net(skb->sk);
1635 	int idx, q_idx;
1636 	int s_idx, s_q_idx;
1637 	struct net_device *dev;
1638 	const struct nlmsghdr *nlh = cb->nlh;
1639 	struct nlattr *tca[TCA_MAX + 1];
1640 	int err;
1641 
1642 	s_idx = cb->args[0];
1643 	s_q_idx = q_idx = cb->args[1];
1644 
1645 	idx = 0;
1646 	ASSERT_RTNL();
1647 
1648 	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1649 	if (err < 0)
1650 		return err;
1651 
1652 	for_each_netdev(net, dev) {
1653 		struct netdev_queue *dev_queue;
1654 
1655 		if (idx < s_idx)
1656 			goto cont;
1657 		if (idx > s_idx)
1658 			s_q_idx = 0;
1659 		q_idx = 0;
1660 
1661 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1662 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1663 			goto done;
1664 
1665 		dev_queue = dev_ingress_queue(dev);
1666 		if (dev_queue &&
1667 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1668 				       &q_idx, s_q_idx, false,
1669 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1670 			goto done;
1671 
1672 cont:
1673 		idx++;
1674 	}
1675 
1676 done:
1677 	cb->args[0] = idx;
1678 	cb->args[1] = q_idx;
1679 
1680 	return skb->len;
1681 }
1682 
1683 
1684 
1685 /************************************************
1686  *	Traffic classes manipulation.		*
1687  ************************************************/
1688 
1689 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1690 			  unsigned long cl,
1691 			  u32 portid, u32 seq, u16 flags, int event)
1692 {
1693 	struct tcmsg *tcm;
1694 	struct nlmsghdr  *nlh;
1695 	unsigned char *b = skb_tail_pointer(skb);
1696 	struct gnet_dump d;
1697 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1698 
1699 	cond_resched();
1700 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1701 	if (!nlh)
1702 		goto out_nlmsg_trim;
1703 	tcm = nlmsg_data(nlh);
1704 	tcm->tcm_family = AF_UNSPEC;
1705 	tcm->tcm__pad1 = 0;
1706 	tcm->tcm__pad2 = 0;
1707 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1708 	tcm->tcm_parent = q->handle;
1709 	tcm->tcm_handle = q->handle;
1710 	tcm->tcm_info = 0;
1711 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1712 		goto nla_put_failure;
1713 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1714 		goto nla_put_failure;
1715 
1716 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1717 					 NULL, &d, TCA_PAD) < 0)
1718 		goto nla_put_failure;
1719 
1720 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1721 		goto nla_put_failure;
1722 
1723 	if (gnet_stats_finish_copy(&d) < 0)
1724 		goto nla_put_failure;
1725 
1726 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1727 	return skb->len;
1728 
1729 out_nlmsg_trim:
1730 nla_put_failure:
1731 	nlmsg_trim(skb, b);
1732 	return -1;
1733 }
1734 
1735 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1736 			 struct nlmsghdr *n, struct Qdisc *q,
1737 			 unsigned long cl, int event)
1738 {
1739 	struct sk_buff *skb;
1740 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1741 
1742 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1743 	if (!skb)
1744 		return -ENOBUFS;
1745 
1746 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1747 		kfree_skb(skb);
1748 		return -EINVAL;
1749 	}
1750 
1751 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1752 			      n->nlmsg_flags & NLM_F_ECHO);
1753 }
1754 
1755 static int tclass_del_notify(struct net *net,
1756 			     const struct Qdisc_class_ops *cops,
1757 			     struct sk_buff *oskb, struct nlmsghdr *n,
1758 			     struct Qdisc *q, unsigned long cl)
1759 {
1760 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1761 	struct sk_buff *skb;
1762 	int err = 0;
1763 
1764 	if (!cops->delete)
1765 		return -EOPNOTSUPP;
1766 
1767 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1768 	if (!skb)
1769 		return -ENOBUFS;
1770 
1771 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1772 			   RTM_DELTCLASS) < 0) {
1773 		kfree_skb(skb);
1774 		return -EINVAL;
1775 	}
1776 
1777 	err = cops->delete(q, cl);
1778 	if (err) {
1779 		kfree_skb(skb);
1780 		return err;
1781 	}
1782 
1783 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1784 			      n->nlmsg_flags & NLM_F_ECHO);
1785 }
1786 
1787 #ifdef CONFIG_NET_CLS
1788 
1789 struct tcf_bind_args {
1790 	struct tcf_walker w;
1791 	u32 classid;
1792 	unsigned long cl;
1793 };
1794 
1795 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1796 {
1797 	struct tcf_bind_args *a = (void *)arg;
1798 
1799 	if (tp->ops->bind_class) {
1800 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1801 
1802 		sch_tree_lock(q);
1803 		tp->ops->bind_class(n, a->classid, a->cl);
1804 		sch_tree_unlock(q);
1805 	}
1806 	return 0;
1807 }
1808 
1809 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1810 			   unsigned long new_cl)
1811 {
1812 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1813 	struct tcf_block *block;
1814 	struct tcf_chain *chain;
1815 	unsigned long cl;
1816 
1817 	cl = cops->find(q, portid);
1818 	if (!cl)
1819 		return;
1820 	block = cops->tcf_block(q, cl, NULL);
1821 	if (!block)
1822 		return;
1823 	list_for_each_entry(chain, &block->chain_list, list) {
1824 		struct tcf_proto *tp;
1825 
1826 		for (tp = rtnl_dereference(chain->filter_chain);
1827 		     tp; tp = rtnl_dereference(tp->next)) {
1828 			struct tcf_bind_args arg = {};
1829 
1830 			arg.w.fn = tcf_node_bind;
1831 			arg.classid = clid;
1832 			arg.cl = new_cl;
1833 			tp->ops->walk(tp, &arg.w);
1834 		}
1835 	}
1836 }
1837 
1838 #else
1839 
1840 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1841 			   unsigned long new_cl)
1842 {
1843 }
1844 
1845 #endif
1846 
1847 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1848 			 struct netlink_ext_ack *extack)
1849 {
1850 	struct net *net = sock_net(skb->sk);
1851 	struct tcmsg *tcm = nlmsg_data(n);
1852 	struct nlattr *tca[TCA_MAX + 1];
1853 	struct net_device *dev;
1854 	struct Qdisc *q = NULL;
1855 	const struct Qdisc_class_ops *cops;
1856 	unsigned long cl = 0;
1857 	unsigned long new_cl;
1858 	u32 portid;
1859 	u32 clid;
1860 	u32 qid;
1861 	int err;
1862 
1863 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1864 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1865 		return -EPERM;
1866 
1867 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1868 	if (err < 0)
1869 		return err;
1870 
1871 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1872 	if (!dev)
1873 		return -ENODEV;
1874 
1875 	/*
1876 	   parent == TC_H_UNSPEC - unspecified parent.
1877 	   parent == TC_H_ROOT   - class is root, which has no parent.
1878 	   parent == X:0	 - parent is root class.
1879 	   parent == X:Y	 - parent is a node in hierarchy.
1880 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1881 
1882 	   handle == 0:0	 - generate handle from kernel pool.
1883 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1884 	   handle == X:Y	 - clear.
1885 	   handle == X:0	 - root class.
1886 	 */
1887 
1888 	/* Step 1. Determine qdisc handle X:0 */
1889 
1890 	portid = tcm->tcm_parent;
1891 	clid = tcm->tcm_handle;
1892 	qid = TC_H_MAJ(clid);
1893 
1894 	if (portid != TC_H_ROOT) {
1895 		u32 qid1 = TC_H_MAJ(portid);
1896 
1897 		if (qid && qid1) {
1898 			/* If both majors are known, they must be identical. */
1899 			if (qid != qid1)
1900 				return -EINVAL;
1901 		} else if (qid1) {
1902 			qid = qid1;
1903 		} else if (qid == 0)
1904 			qid = dev->qdisc->handle;
1905 
1906 		/* Now qid is genuine qdisc handle consistent
1907 		 * both with parent and child.
1908 		 *
1909 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1910 		 */
1911 		if (portid)
1912 			portid = TC_H_MAKE(qid, portid);
1913 	} else {
1914 		if (qid == 0)
1915 			qid = dev->qdisc->handle;
1916 	}
1917 
1918 	/* OK. Locate qdisc */
1919 	q = qdisc_lookup(dev, qid);
1920 	if (!q)
1921 		return -ENOENT;
1922 
1923 	/* An check that it supports classes */
1924 	cops = q->ops->cl_ops;
1925 	if (cops == NULL)
1926 		return -EINVAL;
1927 
1928 	/* Now try to get class */
1929 	if (clid == 0) {
1930 		if (portid == TC_H_ROOT)
1931 			clid = qid;
1932 	} else
1933 		clid = TC_H_MAKE(qid, clid);
1934 
1935 	if (clid)
1936 		cl = cops->find(q, clid);
1937 
1938 	if (cl == 0) {
1939 		err = -ENOENT;
1940 		if (n->nlmsg_type != RTM_NEWTCLASS ||
1941 		    !(n->nlmsg_flags & NLM_F_CREATE))
1942 			goto out;
1943 	} else {
1944 		switch (n->nlmsg_type) {
1945 		case RTM_NEWTCLASS:
1946 			err = -EEXIST;
1947 			if (n->nlmsg_flags & NLM_F_EXCL)
1948 				goto out;
1949 			break;
1950 		case RTM_DELTCLASS:
1951 			err = tclass_del_notify(net, cops, skb, n, q, cl);
1952 			/* Unbind the class with flilters with 0 */
1953 			tc_bind_tclass(q, portid, clid, 0);
1954 			goto out;
1955 		case RTM_GETTCLASS:
1956 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1957 			goto out;
1958 		default:
1959 			err = -EINVAL;
1960 			goto out;
1961 		}
1962 	}
1963 
1964 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1965 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1966 		return -EOPNOTSUPP;
1967 	}
1968 
1969 	new_cl = cl;
1970 	err = -EOPNOTSUPP;
1971 	if (cops->change)
1972 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
1973 	if (err == 0) {
1974 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1975 		/* We just create a new class, need to do reverse binding. */
1976 		if (cl != new_cl)
1977 			tc_bind_tclass(q, portid, clid, new_cl);
1978 	}
1979 out:
1980 	return err;
1981 }
1982 
1983 struct qdisc_dump_args {
1984 	struct qdisc_walker	w;
1985 	struct sk_buff		*skb;
1986 	struct netlink_callback	*cb;
1987 };
1988 
1989 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1990 			    struct qdisc_walker *arg)
1991 {
1992 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1993 
1994 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1995 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1996 			      RTM_NEWTCLASS);
1997 }
1998 
1999 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2000 				struct tcmsg *tcm, struct netlink_callback *cb,
2001 				int *t_p, int s_t)
2002 {
2003 	struct qdisc_dump_args arg;
2004 
2005 	if (tc_qdisc_dump_ignore(q, false) ||
2006 	    *t_p < s_t || !q->ops->cl_ops ||
2007 	    (tcm->tcm_parent &&
2008 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2009 		(*t_p)++;
2010 		return 0;
2011 	}
2012 	if (*t_p > s_t)
2013 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2014 	arg.w.fn = qdisc_class_dump;
2015 	arg.skb = skb;
2016 	arg.cb = cb;
2017 	arg.w.stop  = 0;
2018 	arg.w.skip = cb->args[1];
2019 	arg.w.count = 0;
2020 	q->ops->cl_ops->walk(q, &arg.w);
2021 	cb->args[1] = arg.w.count;
2022 	if (arg.w.stop)
2023 		return -1;
2024 	(*t_p)++;
2025 	return 0;
2026 }
2027 
2028 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2029 			       struct tcmsg *tcm, struct netlink_callback *cb,
2030 			       int *t_p, int s_t)
2031 {
2032 	struct Qdisc *q;
2033 	int b;
2034 
2035 	if (!root)
2036 		return 0;
2037 
2038 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2039 		return -1;
2040 
2041 	if (!qdisc_dev(root))
2042 		return 0;
2043 
2044 	if (tcm->tcm_parent) {
2045 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2046 		if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2047 			return -1;
2048 		return 0;
2049 	}
2050 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2051 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2052 			return -1;
2053 	}
2054 
2055 	return 0;
2056 }
2057 
2058 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2059 {
2060 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2061 	struct net *net = sock_net(skb->sk);
2062 	struct netdev_queue *dev_queue;
2063 	struct net_device *dev;
2064 	int t, s_t;
2065 
2066 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2067 		return 0;
2068 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2069 	if (!dev)
2070 		return 0;
2071 
2072 	s_t = cb->args[0];
2073 	t = 0;
2074 
2075 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2076 		goto done;
2077 
2078 	dev_queue = dev_ingress_queue(dev);
2079 	if (dev_queue &&
2080 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2081 				&t, s_t) < 0)
2082 		goto done;
2083 
2084 done:
2085 	cb->args[0] = t;
2086 
2087 	dev_put(dev);
2088 	return skb->len;
2089 }
2090 
2091 #ifdef CONFIG_PROC_FS
2092 static int psched_show(struct seq_file *seq, void *v)
2093 {
2094 	seq_printf(seq, "%08x %08x %08x %08x\n",
2095 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2096 		   1000000,
2097 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2098 
2099 	return 0;
2100 }
2101 
2102 static int __net_init psched_net_init(struct net *net)
2103 {
2104 	struct proc_dir_entry *e;
2105 
2106 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2107 	if (e == NULL)
2108 		return -ENOMEM;
2109 
2110 	return 0;
2111 }
2112 
2113 static void __net_exit psched_net_exit(struct net *net)
2114 {
2115 	remove_proc_entry("psched", net->proc_net);
2116 }
2117 #else
2118 static int __net_init psched_net_init(struct net *net)
2119 {
2120 	return 0;
2121 }
2122 
2123 static void __net_exit psched_net_exit(struct net *net)
2124 {
2125 }
2126 #endif
2127 
2128 static struct pernet_operations psched_net_ops = {
2129 	.init = psched_net_init,
2130 	.exit = psched_net_exit,
2131 };
2132 
2133 static int __init pktsched_init(void)
2134 {
2135 	int err;
2136 
2137 	err = register_pernet_subsys(&psched_net_ops);
2138 	if (err) {
2139 		pr_err("pktsched_init: "
2140 		       "cannot initialize per netns operations\n");
2141 		return err;
2142 	}
2143 
2144 	register_qdisc(&pfifo_fast_ops);
2145 	register_qdisc(&pfifo_qdisc_ops);
2146 	register_qdisc(&bfifo_qdisc_ops);
2147 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2148 	register_qdisc(&mq_qdisc_ops);
2149 	register_qdisc(&noqueue_qdisc_ops);
2150 
2151 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2152 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2153 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2154 		      0);
2155 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2156 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2157 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2158 		      0);
2159 
2160 	return 0;
2161 }
2162 
2163 subsys_initcall(pktsched_init);
2164