xref: /openbmc/linux/net/sched/sch_api.c (revision ae213c44)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/slab.h>
31 #include <linux/hashtable.h>
32 
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 #include <net/pkt_cls.h>
38 
39 /*
40 
41    Short review.
42    -------------
43 
44    This file consists of two interrelated parts:
45 
46    1. queueing disciplines manager frontend.
47    2. traffic classes manager frontend.
48 
49    Generally, a queueing discipline ("qdisc") is a black box
50    that is able to enqueue packets and to dequeue them (when the
51    device is ready to send something) in an order and at times
52    determined by the algorithm hidden inside it.
53
54    qdiscs are divided into two categories:
55    - "queues", which have no internal structure visible from outside.
56    - "schedulers", which split all the packets into "traffic classes",
57      using "packet classifiers" (see cls_api.c).
58
59    In turn, classes may have child qdiscs (as a rule, queues)
60    attached to them, and so on recursively.
61 
62    The goal of the routines in this file is to translate
63    the information supplied by the user in the form of handles
64    into a form more intelligible to the kernel, to perform some
65    sanity checks and the parts of the work that are common to all
66    qdiscs, and to provide rtnetlink notifications.
67
68    All the real intelligent work is done inside the qdisc modules.
69 
70 
71 
72    Every discipline has two major routines: enqueue and dequeue.
73 
74    ---dequeue
75 
76    dequeue usually returns an skb to send. It is allowed to return NULL,
77    but that does not mean the queue is empty; it just means that the
78    discipline does not want to send anything this time.
79    The queue is really empty only if q->q.qlen == 0.
80    For complicated disciplines with multiple queues, q->q is not the
81    real packet queue, but q->q.qlen must nevertheless be valid.
82
83    ---enqueue
84
85    enqueue returns 0 if the packet was enqueued successfully.
86    If a packet (this one or another one) was dropped, it returns
87    a non-zero error code:
88    NET_XMIT_DROP 	- this packet was dropped
89      Expected action: do not back off, but wait until the queue clears.
90    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
91      Expected action: back off or ignore
92 
93    Auxiliary routines:
94 
95    ---peek
96 
97    like dequeue, but without removing a packet from the queue.
98
99    ---reset
100
101    returns the qdisc to its initial state: purges all buffers, clears
102    all timers and counters (except statistics), etc.
103
104    ---init
105
106    initializes a newly created qdisc.
107
108    ---destroy
109
110    destroys resources allocated by init and during the lifetime of the qdisc.
111
112    ---change
113
114    changes the qdisc parameters.
115  */
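
/* As an illustration of the enqueue/dequeue contract described above, a
 * minimal FIFO-style pair of routines might look like the sketch below
 * (a hypothetical example; a real qdisc also needs a registered
 * Qdisc_ops, see register_qdisc() below):
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				   struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < sch->limit))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_drop(skb, sch, to_free); // NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch); // NULL: nothing to send now
 *	}
 */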
116 
117 /* Protects the list of registered TC modules. It is a pure SMP lock. */
118 static DEFINE_RWLOCK(qdisc_mod_lock);
119 
120 
121 /************************************************
122  *	Queueing disciplines manipulation.	*
123  ************************************************/
124 
125 
126 /* The list of all installed queueing disciplines. */
127 
128 static struct Qdisc_ops *qdisc_base;
129 
130 /* Register/unregister queueing discipline */
131 
132 int register_qdisc(struct Qdisc_ops *qops)
133 {
134 	struct Qdisc_ops *q, **qp;
135 	int rc = -EEXIST;
136 
137 	write_lock(&qdisc_mod_lock);
138 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
139 		if (!strcmp(qops->id, q->id))
140 			goto out;
141 
142 	if (qops->enqueue == NULL)
143 		qops->enqueue = noop_qdisc_ops.enqueue;
144 	if (qops->peek == NULL) {
145 		if (qops->dequeue == NULL)
146 			qops->peek = noop_qdisc_ops.peek;
147 		else
148 			goto out_einval;
149 	}
150 	if (qops->dequeue == NULL)
151 		qops->dequeue = noop_qdisc_ops.dequeue;
152 
153 	if (qops->cl_ops) {
154 		const struct Qdisc_class_ops *cops = qops->cl_ops;
155 
156 		if (!(cops->find && cops->walk && cops->leaf))
157 			goto out_einval;
158 
159 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
160 			goto out_einval;
161 	}
162 
163 	qops->next = NULL;
164 	*qp = qops;
165 	rc = 0;
166 out:
167 	write_unlock(&qdisc_mod_lock);
168 	return rc;
169 
170 out_einval:
171 	rc = -EINVAL;
172 	goto out;
173 }
174 EXPORT_SYMBOL(register_qdisc);
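
/* Typical usage from a scheduler module (a hypothetical sketch; the
 * "example_*" names are placeholders, qdisc_peek_head() is the stock
 * helper from sch_generic.h):
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 */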
175 
176 int unregister_qdisc(struct Qdisc_ops *qops)
177 {
178 	struct Qdisc_ops *q, **qp;
179 	int err = -ENOENT;
180 
181 	write_lock(&qdisc_mod_lock);
182 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
183 		if (q == qops)
184 			break;
185 	if (q) {
186 		*qp = q->next;
187 		q->next = NULL;
188 		err = 0;
189 	}
190 	write_unlock(&qdisc_mod_lock);
191 	return err;
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194 
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198 	read_lock(&qdisc_mod_lock);
199 	strlcpy(name, default_qdisc_ops->id, len);
200 	read_unlock(&qdisc_mod_lock);
201 }
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205 	struct Qdisc_ops *q = NULL;
206 
207 	for (q = qdisc_base; q; q = q->next) {
208 		if (!strcmp(name, q->id)) {
209 			if (!try_module_get(q->owner))
210 				q = NULL;
211 			break;
212 		}
213 	}
214 
215 	return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221 	const struct Qdisc_ops *ops;
222 
223 	if (!capable(CAP_NET_ADMIN))
224 		return -EPERM;
225 
226 	write_lock(&qdisc_mod_lock);
227 	ops = qdisc_lookup_default(name);
228 	if (!ops) {
229 		/* Not found, drop lock and try to load module */
230 		write_unlock(&qdisc_mod_lock);
231 		request_module("sch_%s", name);
232 		write_lock(&qdisc_mod_lock);
233 
234 		ops = qdisc_lookup_default(name);
235 	}
236 
237 	if (ops) {
238 		/* Set new default */
239 		module_put(default_qdisc_ops->owner);
240 		default_qdisc_ops = ops;
241 	}
242 	write_unlock(&qdisc_mod_lock);
243 
244 	return ops ? 0 : -ENOENT;
245 }
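
/* Besides the CONFIG_NET_SCH_DEFAULT hook below, qdisc_set_default() is
 * also reachable at run time through the net.core.default_qdisc sysctl,
 * e.g.:
 *
 *	sysctl -w net.core.default_qdisc=fq_codel
 */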
246 
247 #ifdef CONFIG_NET_SCH_DEFAULT
248 /* Set default value from kernel config */
249 static int __init sch_default_qdisc(void)
250 {
251 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 }
253 late_initcall(sch_default_qdisc);
254 #endif
255 
256 /* We know the handle. Find the qdisc among all qdiscs attached to the
257  * device (the root qdisc, all its children, children of children, etc.).
258  * Note: the caller must hold either the RTNL lock or rcu_read_lock().
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263 	struct Qdisc *q;
264 
265 	if (!qdisc_dev(root))
266 		return (root->handle == handle ? root : NULL);
267 
268 	if (!(root->flags & TCQ_F_BUILTIN) &&
269 	    root->handle == handle)
270 		return root;
271 
272 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
273 		if (q->handle == handle)
274 			return q;
275 	}
276 	return NULL;
277 }
278 
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282 		ASSERT_RTNL();
283 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284 		if (invisible)
285 			q->flags |= TCQ_F_INVISIBLE;
286 	}
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289 
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293 		ASSERT_RTNL();
294 		hash_del_rcu(&q->hash);
295 	}
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298 
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301 	struct Qdisc *q;
302 
303 	if (!handle)
304 		return NULL;
305 	q = qdisc_match_from_root(dev->qdisc, handle);
306 	if (q)
307 		goto out;
308 
309 	if (dev_ingress_queue(dev))
310 		q = qdisc_match_from_root(
311 			dev_ingress_queue(dev)->qdisc_sleeping,
312 			handle);
313 out:
314 	return q;
315 }
316 
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319 	struct netdev_queue *nq;
320 	struct Qdisc *q;
321 
322 	if (!handle)
323 		return NULL;
324 	q = qdisc_match_from_root(dev->qdisc, handle);
325 	if (q)
326 		goto out;
327 
328 	nq = dev_ingress_queue_rcu(dev);
329 	if (nq)
330 		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332 	return q;
333 }
334 
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337 	unsigned long cl;
338 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
339 
340 	if (cops == NULL)
341 		return NULL;
342 	cl = cops->find(p, classid);
343 
344 	if (cl == 0)
345 		return NULL;
346 	return cops->leaf(p, cl);
347 }
348 
349 /* Find queueing discipline by name */
350 
351 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
352 {
353 	struct Qdisc_ops *q = NULL;
354 
355 	if (kind) {
356 		read_lock(&qdisc_mod_lock);
357 		for (q = qdisc_base; q; q = q->next) {
358 			if (nla_strcmp(kind, q->id) == 0) {
359 				if (!try_module_get(q->owner))
360 					q = NULL;
361 				break;
362 			}
363 		}
364 		read_unlock(&qdisc_mod_lock);
365 	}
366 	return q;
367 }
368 
369 /* The linklayer setting was not transferred from iproute2 in older
370  * versions, and the rate table lookup system has been dropped from
371  * the kernel. To stay backward compatible with older iproute2 tc
372  * utilities, we detect the linklayer setting by checking whether the
373  * rate table was modified.
374  *
375  * For linklayer ATM table entries, the rate table will be aligned to
376  * 48 bytes, thus some table entries will contain the same value.  The
377  * mpu (min packet unit) is also encoded into the old rate table, thus
378  * starting from the mpu, we find the low and high table entries for
379  * mapping this cell.  If these entries contain the same value, then
380  * the rate table has been modified for linklayer ATM.
381  *
382  * This is done by rounding the mpu up to the nearest 48-byte cell/entry,
383  * then rounding up to the next cell, calculating the table entry one
384  * below, and comparing the two.
385  */
386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
387 {
388 	int low       = roundup(r->mpu, 48);
389 	int high      = roundup(low+1, 48);
390 	int cell_low  = low >> r->cell_log;
391 	int cell_high = (high >> r->cell_log) - 1;
392 
393 	/* rtab is too inaccurate at rates > 100Mbit/s */
394 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
395 		pr_debug("TC linklayer: Giving up ATM detection\n");
396 		return TC_LINKLAYER_ETHERNET;
397 	}
398 
399 	if ((cell_high > cell_low) && (cell_high < 256)
400 	    && (rtab[cell_low] == rtab[cell_high])) {
401 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
402 			 cell_low, cell_high, rtab[cell_high]);
403 		return TC_LINKLAYER_ATM;
404 	}
405 	return TC_LINKLAYER_ETHERNET;
406 }
407 
408 static struct qdisc_rate_table *qdisc_rtab_list;
409 
410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
411 					struct nlattr *tab,
412 					struct netlink_ext_ack *extack)
413 {
414 	struct qdisc_rate_table *rtab;
415 
416 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
417 	    nla_len(tab) != TC_RTAB_SIZE) {
418 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419 		return NULL;
420 	}
421 
422 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
425 			rtab->refcnt++;
426 			return rtab;
427 		}
428 	}
429 
430 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431 	if (rtab) {
432 		rtab->rate = *r;
433 		rtab->refcnt = 1;
434 		memcpy(rtab->data, nla_data(tab), 1024);
435 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
436 			r->linklayer = __detect_linklayer(r, rtab->data);
437 		rtab->next = qdisc_rtab_list;
438 		qdisc_rtab_list = rtab;
439 	} else {
440 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441 	}
442 	return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
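
/* A rate table maps packet sizes to transmission times: callers index
 * rtab->data[pktlen >> rtab->rate.cell_log] to obtain the cost, in
 * psched ticks, of sending a pktlen-byte packet at rtab->rate.rate.
 */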
445 
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448 	struct qdisc_rate_table *rtab, **rtabp;
449 
450 	if (!tab || --tab->refcnt)
451 		return;
452 
453 	for (rtabp = &qdisc_rtab_list;
454 	     (rtab = *rtabp) != NULL;
455 	     rtabp = &rtab->next) {
456 		if (rtab == tab) {
457 			*rtabp = rtab->next;
458 			kfree(rtab);
459 			return;
460 		}
461 	}
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464 
465 static LIST_HEAD(qdisc_stab_list);
466 
467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
468 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
469 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
470 };
471 
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473 					       struct netlink_ext_ack *extack)
474 {
475 	struct nlattr *tb[TCA_STAB_MAX + 1];
476 	struct qdisc_size_table *stab;
477 	struct tc_sizespec *s;
478 	unsigned int tsize = 0;
479 	u16 *tab = NULL;
480 	int err;
481 
482 	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
483 					  extack);
484 	if (err < 0)
485 		return ERR_PTR(err);
486 	if (!tb[TCA_STAB_BASE]) {
487 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
488 		return ERR_PTR(-EINVAL);
489 	}
490 
491 	s = nla_data(tb[TCA_STAB_BASE]);
492 
493 	if (s->tsize > 0) {
494 		if (!tb[TCA_STAB_DATA]) {
495 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
496 			return ERR_PTR(-EINVAL);
497 		}
498 		tab = nla_data(tb[TCA_STAB_DATA]);
499 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
500 	}
501 
502 	if (tsize != s->tsize || (!tab && tsize > 0)) {
503 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
504 		return ERR_PTR(-EINVAL);
505 	}
506 
507 	list_for_each_entry(stab, &qdisc_stab_list, list) {
508 		if (memcmp(&stab->szopts, s, sizeof(*s)))
509 			continue;
510 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
511 			continue;
512 		stab->refcnt++;
513 		return stab;
514 	}
515 
516 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
517 	if (!stab)
518 		return ERR_PTR(-ENOMEM);
519 
520 	stab->refcnt = 1;
521 	stab->szopts = *s;
522 	if (tsize > 0)
523 		memcpy(stab->data, tab, tsize * sizeof(u16));
524 
525 	list_add_tail(&stab->list, &qdisc_stab_list);
526 
527 	return stab;
528 }
529 
530 void qdisc_put_stab(struct qdisc_size_table *tab)
531 {
532 	if (!tab)
533 		return;
534 
535 	if (--tab->refcnt == 0) {
536 		list_del(&tab->list);
537 		kfree_rcu(tab, rcu);
538 	}
539 }
540 EXPORT_SYMBOL(qdisc_put_stab);
541 
542 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
543 {
544 	struct nlattr *nest;
545 
546 	nest = nla_nest_start_noflag(skb, TCA_STAB);
547 	if (nest == NULL)
548 		goto nla_put_failure;
549 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
550 		goto nla_put_failure;
551 	nla_nest_end(skb, nest);
552 
553 	return skb->len;
554 
555 nla_put_failure:
556 	return -1;
557 }
558 
559 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
560 			       const struct qdisc_size_table *stab)
561 {
562 	int pkt_len, slot;
563 
564 	pkt_len = skb->len + stab->szopts.overhead;
565 	if (unlikely(!stab->szopts.tsize))
566 		goto out;
567 
568 	slot = pkt_len + stab->szopts.cell_align;
569 	if (unlikely(slot < 0))
570 		slot = 0;
571 
572 	slot >>= stab->szopts.cell_log;
573 	if (likely(slot < stab->szopts.tsize))
574 		pkt_len = stab->data[slot];
575 	else
576 		pkt_len = stab->data[stab->szopts.tsize - 1] *
577 				(slot / stab->szopts.tsize) +
578 				stab->data[slot % stab->szopts.tsize];
579 
580 	pkt_len <<= stab->szopts.size_log;
581 out:
582 	if (unlikely(pkt_len < 1))
583 		pkt_len = 1;
584 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
585 }
586 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
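
/* Worked example with assumed size-table parameters: for cell_log = 6,
 * size_log = 0, cell_align = -1, overhead = 0 and tsize = 512, a
 * 100-byte skb maps to slot (100 - 1) >> 6 = 1, so the qdisc layer
 * accounts stab->data[1] bytes for it instead of skb->len.
 */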
587 
588 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
589 {
590 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
591 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
592 			txt, qdisc->ops->id, qdisc->handle >> 16);
593 		qdisc->flags |= TCQ_F_WARN_NONWC;
594 	}
595 }
596 EXPORT_SYMBOL(qdisc_warn_nonwc);
597 
598 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
599 {
600 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
601 						 timer);
602 
603 	rcu_read_lock();
604 	__netif_schedule(qdisc_root(wd->qdisc));
605 	rcu_read_unlock();
606 
607 	return HRTIMER_NORESTART;
608 }
609 
610 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
611 				 clockid_t clockid)
612 {
613 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
614 	wd->timer.function = qdisc_watchdog;
615 	wd->qdisc = qdisc;
616 }
617 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
618 
619 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
620 {
621 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
622 }
623 EXPORT_SYMBOL(qdisc_watchdog_init);
624 
625 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
626 {
627 	if (test_bit(__QDISC_STATE_DEACTIVATED,
628 		     &qdisc_root_sleeping(wd->qdisc)->state))
629 		return;
630 
631 	if (wd->last_expires == expires)
632 		return;
633 
634 	wd->last_expires = expires;
635 	hrtimer_start(&wd->timer,
636 		      ns_to_ktime(expires),
637 		      HRTIMER_MODE_ABS_PINNED);
638 }
639 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
640 
641 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
642 {
643 	hrtimer_cancel(&wd->timer);
644 }
645 EXPORT_SYMBOL(qdisc_watchdog_cancel);
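
/* Typical watchdog usage in a shaping qdisc (sketch): when ->dequeue()
 * finds that the head packet may only leave at some future time t (in
 * nanoseconds), it returns NULL and arms the watchdog so the device is
 * rescheduled at that time:
 *
 *	qdisc_watchdog_schedule_ns(&q->watchdog, t);
 */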
646 
647 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
648 {
649 	struct hlist_head *h;
650 	unsigned int i;
651 
652 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
653 
654 	if (h != NULL) {
655 		for (i = 0; i < n; i++)
656 			INIT_HLIST_HEAD(&h[i]);
657 	}
658 	return h;
659 }
660 
661 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
662 {
663 	struct Qdisc_class_common *cl;
664 	struct hlist_node *next;
665 	struct hlist_head *nhash, *ohash;
666 	unsigned int nsize, nmask, osize;
667 	unsigned int i, h;
668 
669 	/* Rehash when load factor exceeds 0.75 */
670 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
671 		return;
672 	nsize = clhash->hashsize * 2;
673 	nmask = nsize - 1;
674 	nhash = qdisc_class_hash_alloc(nsize);
675 	if (nhash == NULL)
676 		return;
677 
678 	ohash = clhash->hash;
679 	osize = clhash->hashsize;
680 
681 	sch_tree_lock(sch);
682 	for (i = 0; i < osize; i++) {
683 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
684 			h = qdisc_class_hash(cl->classid, nmask);
685 			hlist_add_head(&cl->hnode, &nhash[h]);
686 		}
687 	}
688 	clhash->hash     = nhash;
689 	clhash->hashsize = nsize;
690 	clhash->hashmask = nmask;
691 	sch_tree_unlock(sch);
692 
693 	kvfree(ohash);
694 }
695 EXPORT_SYMBOL(qdisc_class_hash_grow);
696 
697 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
698 {
699 	unsigned int size = 4;
700 
701 	clhash->hash = qdisc_class_hash_alloc(size);
702 	if (!clhash->hash)
703 		return -ENOMEM;
704 	clhash->hashsize  = size;
705 	clhash->hashmask  = size - 1;
706 	clhash->hashelems = 0;
707 	return 0;
708 }
709 EXPORT_SYMBOL(qdisc_class_hash_init);
710 
711 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
712 {
713 	kvfree(clhash->hash);
714 }
715 EXPORT_SYMBOL(qdisc_class_hash_destroy);
716 
717 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
718 			     struct Qdisc_class_common *cl)
719 {
720 	unsigned int h;
721 
722 	INIT_HLIST_NODE(&cl->hnode);
723 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
724 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
725 	clhash->hashelems++;
726 }
727 EXPORT_SYMBOL(qdisc_class_hash_insert);
728 
729 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
730 			     struct Qdisc_class_common *cl)
731 {
732 	hlist_del(&cl->hnode);
733 	clhash->hashelems--;
734 }
735 EXPORT_SYMBOL(qdisc_class_hash_remove);
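
/* A classful qdisc typically resolves a classid through this hash with
 * the qdisc_class_find() helper (sketch; "example_class" is a
 * hypothetical per-class struct embedding a Qdisc_class_common):
 *
 *	struct Qdisc_class_common *c;
 *
 *	c = qdisc_class_find(&q->clhash, classid);
 *	return c ? container_of(c, struct example_class, common) : NULL;
 */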
736 
737 /* Allocate a unique handle from the space managed by the kernel.
738  * The possible range is [8000-FFFF]:0000 (0x8000 values).
739  */
740 static u32 qdisc_alloc_handle(struct net_device *dev)
741 {
742 	int i = 0x8000;
743 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
744 
745 	do {
746 		autohandle += TC_H_MAKE(0x10000U, 0);
747 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
748 			autohandle = TC_H_MAKE(0x80000000U, 0);
749 		if (!qdisc_lookup(dev, autohandle))
750 			return autohandle;
751 		cond_resched();
752 	} while	(--i > 0);
753 
754 	return 0;
755 }
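
/* Handle layout reminder: a handle packs major:minor as
 * (major << 16) | minor, so the first automatically allocated root
 * handle, 0x80010000, is what tc(8) prints as "8001:".
 */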
756 
757 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
758 {
759 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
760 	const struct Qdisc_class_ops *cops;
761 	unsigned long cl;
762 	u32 parentid;
763 	bool notify;
764 	int drops;
765 
766 	if (n == 0 && len == 0)
767 		return;
768 	drops = max_t(int, n, 0);
769 	rcu_read_lock();
770 	while ((parentid = sch->parent)) {
771 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
772 			break;
773 
774 		if (sch->flags & TCQ_F_NOPARENT)
775 			break;
776 		/* Notify the parent qdisc only if the child qdisc becomes empty.
777 		 *
778 		 * If the child was empty even before the update, then the backlog
779 		 * counter is inconsistent and we skip the notification because
780 		 * the parent class is already passive.
781 		 *
782 		 * If the original child was offloaded, then it is allowed
783 		 * to appear empty, so the parent is notified anyway.
784 		 */
785 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
786 						       !qdisc_is_offloaded);
787 		/* TODO: perform the search on a per txq basis */
788 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
789 		if (sch == NULL) {
790 			WARN_ON_ONCE(parentid != TC_H_ROOT);
791 			break;
792 		}
793 		cops = sch->ops->cl_ops;
794 		if (notify && cops->qlen_notify) {
795 			cl = cops->find(sch, parentid);
796 			cops->qlen_notify(sch, cl);
797 		}
798 		sch->q.qlen -= n;
799 		sch->qstats.backlog -= len;
800 		__qdisc_qstats_drop(sch, drops);
801 	}
802 	rcu_read_unlock();
803 }
804 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
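
/* Qdiscs call this after dropping packets from inside the hierarchy so
 * that the qlen/backlog counters of all ancestors stay accurate, e.g.
 * after dropping a single packet of length len from a child:
 *
 *	qdisc_tree_reduce_backlog(sch, 1, len);
 */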
805 
806 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
807 			      void *type_data)
808 {
809 	struct net_device *dev = qdisc_dev(sch);
810 	int err;
811 
812 	sch->flags &= ~TCQ_F_OFFLOADED;
813 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
814 		return 0;
815 
816 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
817 	if (err == -EOPNOTSUPP)
818 		return 0;
819 
820 	if (!err)
821 		sch->flags |= TCQ_F_OFFLOADED;
822 
823 	return err;
824 }
825 EXPORT_SYMBOL(qdisc_offload_dump_helper);
826 
827 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
828 				struct Qdisc *new, struct Qdisc *old,
829 				enum tc_setup_type type, void *type_data,
830 				struct netlink_ext_ack *extack)
831 {
832 	bool any_qdisc_is_offloaded;
833 	int err;
834 
835 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
836 		return;
837 
838 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
839 
840 	/* Don't report error if the graft is part of a destroy operation. */
841 	if (!err || !new || new == &noop_qdisc)
842 		return;
843 
844 	/* Don't report error if the parent, the old child and the new
845 	 * one are not offloaded.
846 	 */
847 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
848 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
849 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
850 
851 	if (any_qdisc_is_offloaded)
852 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
853 }
854 EXPORT_SYMBOL(qdisc_offload_graft_helper);
855 
856 static void qdisc_offload_graft_root(struct net_device *dev,
857 				     struct Qdisc *new, struct Qdisc *old,
858 				     struct netlink_ext_ack *extack)
859 {
860 	struct tc_root_qopt_offload graft_offload = {
861 		.command	= TC_ROOT_GRAFT,
862 		.handle		= new ? new->handle : 0,
863 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
864 				  (old && old->flags & TCQ_F_INGRESS),
865 	};
866 
867 	qdisc_offload_graft_helper(dev, NULL, new, old,
868 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
869 }
870 
871 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
872 			 u32 portid, u32 seq, u16 flags, int event)
873 {
874 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
875 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
876 	struct tcmsg *tcm;
877 	struct nlmsghdr  *nlh;
878 	unsigned char *b = skb_tail_pointer(skb);
879 	struct gnet_dump d;
880 	struct qdisc_size_table *stab;
881 	u32 block_index;
882 	__u32 qlen;
883 
884 	cond_resched();
885 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
886 	if (!nlh)
887 		goto out_nlmsg_trim;
888 	tcm = nlmsg_data(nlh);
889 	tcm->tcm_family = AF_UNSPEC;
890 	tcm->tcm__pad1 = 0;
891 	tcm->tcm__pad2 = 0;
892 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
893 	tcm->tcm_parent = clid;
894 	tcm->tcm_handle = q->handle;
895 	tcm->tcm_info = refcount_read(&q->refcnt);
896 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
897 		goto nla_put_failure;
898 	if (q->ops->ingress_block_get) {
899 		block_index = q->ops->ingress_block_get(q);
900 		if (block_index &&
901 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
902 			goto nla_put_failure;
903 	}
904 	if (q->ops->egress_block_get) {
905 		block_index = q->ops->egress_block_get(q);
906 		if (block_index &&
907 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
908 			goto nla_put_failure;
909 	}
910 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
911 		goto nla_put_failure;
912 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
913 		goto nla_put_failure;
914 	qlen = qdisc_qlen_sum(q);
915 
916 	stab = rtnl_dereference(q->stab);
917 	if (stab && qdisc_dump_stab(skb, stab) < 0)
918 		goto nla_put_failure;
919 
920 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
921 					 NULL, &d, TCA_PAD) < 0)
922 		goto nla_put_failure;
923 
924 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
925 		goto nla_put_failure;
926 
927 	if (qdisc_is_percpu_stats(q)) {
928 		cpu_bstats = q->cpu_bstats;
929 		cpu_qstats = q->cpu_qstats;
930 	}
931 
932 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
933 				  &d, cpu_bstats, &q->bstats) < 0 ||
934 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
935 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
936 		goto nla_put_failure;
937 
938 	if (gnet_stats_finish_copy(&d) < 0)
939 		goto nla_put_failure;
940 
941 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
942 	return skb->len;
943 
944 out_nlmsg_trim:
945 nla_put_failure:
946 	nlmsg_trim(skb, b);
947 	return -1;
948 }
949 
950 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
951 {
952 	if (q->flags & TCQ_F_BUILTIN)
953 		return true;
954 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
955 		return true;
956 
957 	return false;
958 }
959 
960 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
961 			struct nlmsghdr *n, u32 clid,
962 			struct Qdisc *old, struct Qdisc *new)
963 {
964 	struct sk_buff *skb;
965 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
966 
967 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
968 	if (!skb)
969 		return -ENOBUFS;
970 
971 	if (old && !tc_qdisc_dump_ignore(old, false)) {
972 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
973 				  0, RTM_DELQDISC) < 0)
974 			goto err_out;
975 	}
976 	if (new && !tc_qdisc_dump_ignore(new, false)) {
977 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
978 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
979 			goto err_out;
980 	}
981 
982 	if (skb->len)
983 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
984 				      n->nlmsg_flags & NLM_F_ECHO);
985 
986 err_out:
987 	kfree_skb(skb);
988 	return -EINVAL;
989 }
990 
991 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
992 			       struct nlmsghdr *n, u32 clid,
993 			       struct Qdisc *old, struct Qdisc *new)
994 {
995 	if (new || old)
996 		qdisc_notify(net, skb, n, clid, old, new);
997 
998 	if (old)
999 		qdisc_put(old);
1000 }
1001 
1002 static void qdisc_clear_nolock(struct Qdisc *sch)
1003 {
1004 	sch->flags &= ~TCQ_F_NOLOCK;
1005 	if (!(sch->flags & TCQ_F_CPUSTATS))
1006 		return;
1007 
1008 	free_percpu(sch->cpu_bstats);
1009 	free_percpu(sch->cpu_qstats);
1010 	sch->cpu_bstats = NULL;
1011 	sch->cpu_qstats = NULL;
1012 	sch->flags &= ~TCQ_F_CPUSTATS;
1013 }
1014 
1015 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1016  * to device "dev".
1017  *
1018  * When appropriate, send a netlink notification using "skb"
1019  * and "n".
1020  *
1021  * On success, destroy the old qdisc.
1022  */
1023 
1024 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1025 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1026 		       struct Qdisc *new, struct Qdisc *old,
1027 		       struct netlink_ext_ack *extack)
1028 {
1029 	struct Qdisc *q = old;
1030 	struct net *net = dev_net(dev);
1031 
1032 	if (parent == NULL) {
1033 		unsigned int i, num_q, ingress;
1034 
1035 		ingress = 0;
1036 		num_q = dev->num_tx_queues;
1037 		if ((q && q->flags & TCQ_F_INGRESS) ||
1038 		    (new && new->flags & TCQ_F_INGRESS)) {
1039 			num_q = 1;
1040 			ingress = 1;
1041 			if (!dev_ingress_queue(dev)) {
1042 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1043 				return -ENOENT;
1044 			}
1045 		}
1046 
1047 		if (dev->flags & IFF_UP)
1048 			dev_deactivate(dev);
1049 
1050 		qdisc_offload_graft_root(dev, new, old, extack);
1051 
1052 		if (new && new->ops->attach)
1053 			goto skip;
1054 
1055 		for (i = 0; i < num_q; i++) {
1056 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1057 
1058 			if (!ingress)
1059 				dev_queue = netdev_get_tx_queue(dev, i);
1060 
1061 			old = dev_graft_qdisc(dev_queue, new);
1062 			if (new && i > 0)
1063 				qdisc_refcount_inc(new);
1064 
1065 			if (!ingress)
1066 				qdisc_put(old);
1067 		}
1068 
1069 skip:
1070 		if (!ingress) {
1071 			notify_and_destroy(net, skb, n, classid,
1072 					   dev->qdisc, new);
1073 			if (new && !new->ops->attach)
1074 				qdisc_refcount_inc(new);
1075 			dev->qdisc = new ? : &noop_qdisc;
1076 
1077 			if (new && new->ops->attach)
1078 				new->ops->attach(new);
1079 		} else {
1080 			notify_and_destroy(net, skb, n, classid, old, new);
1081 		}
1082 
1083 		if (dev->flags & IFF_UP)
1084 			dev_activate(dev);
1085 	} else {
1086 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1087 		unsigned long cl;
1088 		int err;
1089 
1090 		/* Only support running class lockless if parent is lockless */
1091 		if (new && (new->flags & TCQ_F_NOLOCK) &&
1092 		    parent && !(parent->flags & TCQ_F_NOLOCK))
1093 			qdisc_clear_nolock(new);
1094 
1095 		if (!cops || !cops->graft)
1096 			return -EOPNOTSUPP;
1097 
1098 		cl = cops->find(parent, classid);
1099 		if (!cl) {
1100 			NL_SET_ERR_MSG(extack, "Specified class not found");
1101 			return -ENOENT;
1102 		}
1103 
1104 		err = cops->graft(parent, cl, new, &old, extack);
1105 		if (err)
1106 			return err;
1107 		notify_and_destroy(net, skb, n, classid, old, new);
1108 	}
1109 	return 0;
1110 }
1111 
1112 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1113 				   struct netlink_ext_ack *extack)
1114 {
1115 	u32 block_index;
1116 
1117 	if (tca[TCA_INGRESS_BLOCK]) {
1118 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1119 
1120 		if (!block_index) {
1121 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1122 			return -EINVAL;
1123 		}
1124 		if (!sch->ops->ingress_block_set) {
1125 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1126 			return -EOPNOTSUPP;
1127 		}
1128 		sch->ops->ingress_block_set(sch, block_index);
1129 	}
1130 	if (tca[TCA_EGRESS_BLOCK]) {
1131 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1132 
1133 		if (!block_index) {
1134 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1135 			return -EINVAL;
1136 		}
1137 		if (!sch->ops->egress_block_set) {
1138 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1139 			return -EOPNOTSUPP;
1140 		}
1141 		sch->ops->egress_block_set(sch, block_index);
1142 	}
1143 	return 0;
1144 }
1145 
1146 /*
1147    Allocate and initialize a new qdisc.
1148 
1149    Parameters are passed via opt.
1150  */
1151 
1152 static struct Qdisc *qdisc_create(struct net_device *dev,
1153 				  struct netdev_queue *dev_queue,
1154 				  struct Qdisc *p, u32 parent, u32 handle,
1155 				  struct nlattr **tca, int *errp,
1156 				  struct netlink_ext_ack *extack)
1157 {
1158 	int err;
1159 	struct nlattr *kind = tca[TCA_KIND];
1160 	struct Qdisc *sch;
1161 	struct Qdisc_ops *ops;
1162 	struct qdisc_size_table *stab;
1163 
1164 	ops = qdisc_lookup_ops(kind);
1165 #ifdef CONFIG_MODULES
1166 	if (ops == NULL && kind != NULL) {
1167 		char name[IFNAMSIZ];
1168 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1169 			/* We dropped the RTNL semaphore in order to
1170 			 * perform the module load.  So, even if we
1171 			 * succeeded in loading the module we have to
1172 			 * tell the caller to replay the request.  We
1173 			 * indicate this using -EAGAIN.
1174 			 * We replay the request because the device may
1175 			 * go away in the meantime.
1176 			 */
1177 			rtnl_unlock();
1178 			request_module("sch_%s", name);
1179 			rtnl_lock();
1180 			ops = qdisc_lookup_ops(kind);
1181 			if (ops != NULL) {
1182 				/* We will call qdisc_lookup_ops() again on
1183 				 * replay, so don't keep a reference here.
1184 				 */
1185 				module_put(ops->owner);
1186 				err = -EAGAIN;
1187 				goto err_out;
1188 			}
1189 		}
1190 	}
1191 #endif
1192 
1193 	err = -ENOENT;
1194 	if (!ops) {
1195 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1196 		goto err_out;
1197 	}
1198 
1199 	sch = qdisc_alloc(dev_queue, ops, extack);
1200 	if (IS_ERR(sch)) {
1201 		err = PTR_ERR(sch);
1202 		goto err_out2;
1203 	}
1204 
1205 	sch->parent = parent;
1206 
1207 	if (handle == TC_H_INGRESS) {
1208 		sch->flags |= TCQ_F_INGRESS;
1209 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1210 	} else {
1211 		if (handle == 0) {
1212 			handle = qdisc_alloc_handle(dev);
1213 			if (handle == 0) {
1214 				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1215 				err = -ENOSPC;
1216 				goto err_out3;
1217 			}
1218 		}
1219 		if (!netif_is_multiqueue(dev))
1220 			sch->flags |= TCQ_F_ONETXQUEUE;
1221 	}
1222 
1223 	sch->handle = handle;
1224 
1225 	/* This exists to preserve compatibility with a userspace
1226 	 * loophole that allowed userspace to get the IFF_NO_QUEUE
1227 	 * facility on older kernels by setting tx_queue_len=0 (prior
1228 	 * to qdisc init) and then forgetting to reinit tx_queue_len
1229 	 * before attaching a qdisc again.
1230 	 */
1231 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1232 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1233 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1234 	}
1235 
1236 	err = qdisc_block_indexes_set(sch, tca, extack);
1237 	if (err)
1238 		goto err_out3;
1239 
1240 	if (ops->init) {
1241 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1242 		if (err != 0)
1243 			goto err_out5;
1244 	}
1245 
1246 	if (tca[TCA_STAB]) {
1247 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1248 		if (IS_ERR(stab)) {
1249 			err = PTR_ERR(stab);
1250 			goto err_out4;
1251 		}
1252 		rcu_assign_pointer(sch->stab, stab);
1253 	}
1254 	if (tca[TCA_RATE]) {
1255 		seqcount_t *running;
1256 
1257 		err = -EOPNOTSUPP;
1258 		if (sch->flags & TCQ_F_MQROOT) {
1259 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1260 			goto err_out4;
1261 		}
1262 
1263 		if (sch->parent != TC_H_ROOT &&
1264 		    !(sch->flags & TCQ_F_INGRESS) &&
1265 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1266 			running = qdisc_root_sleeping_running(sch);
1267 		else
1268 			running = &sch->running;
1269 
1270 		err = gen_new_estimator(&sch->bstats,
1271 					sch->cpu_bstats,
1272 					&sch->rate_est,
1273 					NULL,
1274 					running,
1275 					tca[TCA_RATE]);
1276 		if (err) {
1277 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1278 			goto err_out4;
1279 		}
1280 	}
1281 
1282 	qdisc_hash_add(sch, false);
1283 
1284 	return sch;
1285 
1286 err_out5:
1287 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1288 	if (ops->destroy)
1289 		ops->destroy(sch);
1290 err_out3:
1291 	dev_put(dev);
1292 	qdisc_free(sch);
1293 err_out2:
1294 	module_put(ops->owner);
1295 err_out:
1296 	*errp = err;
1297 	return NULL;
1298 
1299 err_out4:
1300 	/*
1301 	 * Any broken qdiscs that would require an ops->reset() here?
1302 	 * The qdisc was never in action so it shouldn't be necessary.
1303 	 */
1304 	qdisc_put_stab(rtnl_dereference(sch->stab));
1305 	if (ops->destroy)
1306 		ops->destroy(sch);
1307 	goto err_out3;
1308 }
1309 
1310 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1311 			struct netlink_ext_ack *extack)
1312 {
1313 	struct qdisc_size_table *ostab, *stab = NULL;
1314 	int err = 0;
1315 
1316 	if (tca[TCA_OPTIONS]) {
1317 		if (!sch->ops->change) {
1318 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1319 			return -EINVAL;
1320 		}
1321 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1322 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1323 			return -EOPNOTSUPP;
1324 		}
1325 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1326 		if (err)
1327 			return err;
1328 	}
1329 
1330 	if (tca[TCA_STAB]) {
1331 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1332 		if (IS_ERR(stab))
1333 			return PTR_ERR(stab);
1334 	}
1335 
1336 	ostab = rtnl_dereference(sch->stab);
1337 	rcu_assign_pointer(sch->stab, stab);
1338 	qdisc_put_stab(ostab);
1339 
1340 	if (tca[TCA_RATE]) {
1341 		/* NB: ignores errors from replace_estimator,
1342 		 * because the change cannot be undone. */
1343 		if (sch->flags & TCQ_F_MQROOT)
1344 			goto out;
1345 		gen_replace_estimator(&sch->bstats,
1346 				      sch->cpu_bstats,
1347 				      &sch->rate_est,
1348 				      NULL,
1349 				      qdisc_root_sleeping_running(sch),
1350 				      tca[TCA_RATE]);
1351 	}
1352 out:
1353 	return 0;
1354 }
1355 
1356 struct check_loop_arg {
1357 	struct qdisc_walker	w;
1358 	struct Qdisc		*p;
1359 	int			depth;
1360 };
1361 
1362 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1363 			 struct qdisc_walker *w);
1364 
1365 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1366 {
1367 	struct check_loop_arg	arg;
1368 
1369 	if (q->ops->cl_ops == NULL)
1370 		return 0;
1371 
1372 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1373 	arg.w.fn = check_loop_fn;
1374 	arg.depth = depth;
1375 	arg.p = p;
1376 	q->ops->cl_ops->walk(q, &arg.w);
1377 	return arg.w.stop ? -ELOOP : 0;
1378 }
1379 
1380 static int
1381 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1382 {
1383 	struct Qdisc *leaf;
1384 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1385 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1386 
1387 	leaf = cops->leaf(q, cl);
1388 	if (leaf) {
1389 		if (leaf == arg->p || arg->depth > 7)
1390 			return -ELOOP;
1391 		return check_loop(leaf, arg->p, arg->depth + 1);
1392 	}
1393 	return 0;
1394 }
1395 
1396 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1397 	[TCA_KIND]		= { .type = NLA_STRING },
1398 	[TCA_RATE]		= { .type = NLA_BINARY,
1399 				    .len = sizeof(struct tc_estimator) },
1400 	[TCA_STAB]		= { .type = NLA_NESTED },
1401 	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
1402 	[TCA_CHAIN]		= { .type = NLA_U32 },
1403 	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
1404 	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
1405 };
1406 
1407 /*
1408  * Delete/get qdisc.
1409  */
1410 
1411 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1412 			struct netlink_ext_ack *extack)
1413 {
1414 	struct net *net = sock_net(skb->sk);
1415 	struct tcmsg *tcm = nlmsg_data(n);
1416 	struct nlattr *tca[TCA_MAX + 1];
1417 	struct net_device *dev;
1418 	u32 clid;
1419 	struct Qdisc *q = NULL;
1420 	struct Qdisc *p = NULL;
1421 	int err;
1422 
1423 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1424 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1425 		return -EPERM;
1426 
1427 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1428 				     rtm_tca_policy, extack);
1429 	if (err < 0)
1430 		return err;
1431 
1432 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1433 	if (!dev)
1434 		return -ENODEV;
1435 
1436 	clid = tcm->tcm_parent;
1437 	if (clid) {
1438 		if (clid != TC_H_ROOT) {
1439 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1440 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1441 				if (!p) {
1442 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1443 					return -ENOENT;
1444 				}
1445 				q = qdisc_leaf(p, clid);
1446 			} else if (dev_ingress_queue(dev)) {
1447 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1448 			}
1449 		} else {
1450 			q = dev->qdisc;
1451 		}
1452 		if (!q) {
1453 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1454 			return -ENOENT;
1455 		}
1456 
1457 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1458 			NL_SET_ERR_MSG(extack, "Invalid handle");
1459 			return -EINVAL;
1460 		}
1461 	} else {
1462 		q = qdisc_lookup(dev, tcm->tcm_handle);
1463 		if (!q) {
1464 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1465 			return -ENOENT;
1466 		}
1467 	}
1468 
1469 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1470 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1471 		return -EINVAL;
1472 	}
1473 
1474 	if (n->nlmsg_type == RTM_DELQDISC) {
1475 		if (!clid) {
1476 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1477 			return -EINVAL;
1478 		}
1479 		if (q->handle == 0) {
1480 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1481 			return -ENOENT;
1482 		}
1483 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1484 		if (err != 0)
1485 			return err;
1486 	} else {
1487 		qdisc_notify(net, skb, n, clid, NULL, q);
1488 	}
1489 	return 0;
1490 }
1491 
1492 /*
1493  * Create/change qdisc.
1494  */
1495 
1496 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1497 			   struct netlink_ext_ack *extack)
1498 {
1499 	struct net *net = sock_net(skb->sk);
1500 	struct tcmsg *tcm;
1501 	struct nlattr *tca[TCA_MAX + 1];
1502 	struct net_device *dev;
1503 	u32 clid;
1504 	struct Qdisc *q, *p;
1505 	int err;
1506 
1507 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1508 		return -EPERM;
1509 
1510 replay:
1511 	/* Reinit, just in case something touches this. */
1512 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1513 				     rtm_tca_policy, extack);
1514 	if (err < 0)
1515 		return err;
1516 
1517 	tcm = nlmsg_data(n);
1518 	clid = tcm->tcm_parent;
1519 	q = p = NULL;
1520 
1521 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1522 	if (!dev)
1523 		return -ENODEV;
1524 
1525 
1526 	if (clid) {
1527 		if (clid != TC_H_ROOT) {
1528 			if (clid != TC_H_INGRESS) {
1529 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1530 				if (!p) {
1531 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1532 					return -ENOENT;
1533 				}
1534 				q = qdisc_leaf(p, clid);
1535 			} else if (dev_ingress_queue_create(dev)) {
1536 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1537 			}
1538 		} else {
1539 			q = dev->qdisc;
1540 		}
1541 
1542 		/* It may be the default qdisc; ignore it. */
1543 		if (q && q->handle == 0)
1544 			q = NULL;
1545 
1546 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1547 			if (tcm->tcm_handle) {
1548 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1549 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1550 					return -EEXIST;
1551 				}
1552 				if (TC_H_MIN(tcm->tcm_handle)) {
1553 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1554 					return -EINVAL;
1555 				}
1556 				q = qdisc_lookup(dev, tcm->tcm_handle);
1557 				if (!q)
1558 					goto create_n_graft;
1559 				if (n->nlmsg_flags & NLM_F_EXCL) {
1560 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1561 					return -EEXIST;
1562 				}
1563 				if (tca[TCA_KIND] &&
1564 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1565 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1566 					return -EINVAL;
1567 				}
1568 				if (q == p ||
1569 				    (p && check_loop(q, p, 0))) {
1570 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1571 					return -ELOOP;
1572 				}
1573 				qdisc_refcount_inc(q);
1574 				goto graft;
1575 			} else {
1576 				if (!q)
1577 					goto create_n_graft;
1578 
1579 				/* This magic test requires explanation.
1580 				 *
1581 				 *   We know that some child q is already
1582 				 *   attached to this parent and we have a choice:
1583 				 *   either change it or create/graft a new one.
1584 				 *
1585 				 *   1. We are allowed to create/graft only
1586 				 *   if both the CREATE and REPLACE flags are set.
1587 				 *
1588 				 *   2. If EXCL is set, the requestor meant that
1589 				 *   a qdisc with tcm_handle is not expected
1590 				 *   to exist, so we choose create/graft too.
1591 				 *
1592 				 *   3. The last case is when no flags are set.
1593 				 *   Alas, this is a sort of hole in the API; we
1594 				 *   cannot decide what to do unambiguously.
1595 				 *   For now we select create/graft if the
1596 				 *   user gave a KIND that does not match the existing one.
1597 				 */
1598 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1599 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1600 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1601 				     (tca[TCA_KIND] &&
1602 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1603 					goto create_n_graft;
1604 			}
1605 		}
1606 	} else {
1607 		if (!tcm->tcm_handle) {
1608 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1609 			return -EINVAL;
1610 		}
1611 		q = qdisc_lookup(dev, tcm->tcm_handle);
1612 	}
1613 
1614 	/* Change qdisc parameters */
1615 	if (!q) {
1616 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1617 		return -ENOENT;
1618 	}
1619 	if (n->nlmsg_flags & NLM_F_EXCL) {
1620 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1621 		return -EEXIST;
1622 	}
1623 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1624 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1625 		return -EINVAL;
1626 	}
1627 	err = qdisc_change(q, tca, extack);
1628 	if (err == 0)
1629 		qdisc_notify(net, skb, n, clid, NULL, q);
1630 	return err;
1631 
1632 create_n_graft:
1633 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1634 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1635 		return -ENOENT;
1636 	}
1637 	if (clid == TC_H_INGRESS) {
1638 		if (dev_ingress_queue(dev)) {
1639 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1640 					 tcm->tcm_parent, tcm->tcm_parent,
1641 					 tca, &err, extack);
1642 		} else {
1643 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1644 			err = -ENOENT;
1645 		}
1646 	} else {
1647 		struct netdev_queue *dev_queue;
1648 
1649 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1650 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1651 		else if (p)
1652 			dev_queue = p->dev_queue;
1653 		else
1654 			dev_queue = netdev_get_tx_queue(dev, 0);
1655 
1656 		q = qdisc_create(dev, dev_queue, p,
1657 				 tcm->tcm_parent, tcm->tcm_handle,
1658 				 tca, &err, extack);
1659 	}
1660 	if (q == NULL) {
1661 		if (err == -EAGAIN)
1662 			goto replay;
1663 		return err;
1664 	}
1665 
1666 graft:
1667 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1668 	if (err) {
1669 		if (q)
1670 			qdisc_put(q);
1671 		return err;
1672 	}
1673 
1674 	return 0;
1675 }
1676 
1677 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1678 			      struct netlink_callback *cb,
1679 			      int *q_idx_p, int s_q_idx, bool recur,
1680 			      bool dump_invisible)
1681 {
1682 	int ret = 0, q_idx = *q_idx_p;
1683 	struct Qdisc *q;
1684 	int b;
1685 
1686 	if (!root)
1687 		return 0;
1688 
1689 	q = root;
1690 	if (q_idx < s_q_idx) {
1691 		q_idx++;
1692 	} else {
1693 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1694 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1695 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1696 				  RTM_NEWQDISC) <= 0)
1697 			goto done;
1698 		q_idx++;
1699 	}
1700 
1701 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1702 	 * itself has already been dumped.
1703 	 *
1704 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1705 	 * qdisc hashtable, we don't want to hit it again.
1706 	 */
1707 	if (!qdisc_dev(root) || !recur)
1708 		goto out;
1709 
1710 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1711 		if (q_idx < s_q_idx) {
1712 			q_idx++;
1713 			continue;
1714 		}
1715 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1716 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1717 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1718 				  RTM_NEWQDISC) <= 0)
1719 			goto done;
1720 		q_idx++;
1721 	}
1722 
1723 out:
1724 	*q_idx_p = q_idx;
1725 	return ret;
1726 done:
1727 	ret = -1;
1728 	goto out;
1729 }
1730 
1731 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1732 {
1733 	struct net *net = sock_net(skb->sk);
1734 	int idx, q_idx;
1735 	int s_idx, s_q_idx;
1736 	struct net_device *dev;
1737 	const struct nlmsghdr *nlh = cb->nlh;
1738 	struct nlattr *tca[TCA_MAX + 1];
1739 	int err;
1740 
1741 	s_idx = cb->args[0];
1742 	s_q_idx = q_idx = cb->args[1];
1743 
1744 	idx = 0;
1745 	ASSERT_RTNL();
1746 
1747 	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1748 				     rtm_tca_policy, cb->extack);
1749 	if (err < 0)
1750 		return err;
1751 
1752 	for_each_netdev(net, dev) {
1753 		struct netdev_queue *dev_queue;
1754 
1755 		if (idx < s_idx)
1756 			goto cont;
1757 		if (idx > s_idx)
1758 			s_q_idx = 0;
1759 		q_idx = 0;
1760 
1761 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1762 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1763 			goto done;
1764 
1765 		dev_queue = dev_ingress_queue(dev);
1766 		if (dev_queue &&
1767 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1768 				       &q_idx, s_q_idx, false,
1769 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1770 			goto done;
1771 
1772 cont:
1773 		idx++;
1774 	}
1775 
1776 done:
1777 	cb->args[0] = idx;
1778 	cb->args[1] = q_idx;
1779 
1780 	return skb->len;
1781 }
1782 
1783 
1784 
1785 /************************************************
1786  *	Traffic classes manipulation.		*
1787  ************************************************/
1788 
1789 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1790 			  unsigned long cl,
1791 			  u32 portid, u32 seq, u16 flags, int event)
1792 {
1793 	struct tcmsg *tcm;
1794 	struct nlmsghdr  *nlh;
1795 	unsigned char *b = skb_tail_pointer(skb);
1796 	struct gnet_dump d;
1797 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1798 
1799 	cond_resched();
1800 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1801 	if (!nlh)
1802 		goto out_nlmsg_trim;
1803 	tcm = nlmsg_data(nlh);
1804 	tcm->tcm_family = AF_UNSPEC;
1805 	tcm->tcm__pad1 = 0;
1806 	tcm->tcm__pad2 = 0;
1807 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1808 	tcm->tcm_parent = q->handle;
1809 	tcm->tcm_handle = q->handle;
1810 	tcm->tcm_info = 0;
1811 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1812 		goto nla_put_failure;
1813 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1814 		goto nla_put_failure;
1815 
1816 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1817 					 NULL, &d, TCA_PAD) < 0)
1818 		goto nla_put_failure;
1819 
1820 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1821 		goto nla_put_failure;
1822 
1823 	if (gnet_stats_finish_copy(&d) < 0)
1824 		goto nla_put_failure;
1825 
1826 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1827 	return skb->len;
1828 
1829 out_nlmsg_trim:
1830 nla_put_failure:
1831 	nlmsg_trim(skb, b);
1832 	return -1;
1833 }
1834 
1835 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1836 			 struct nlmsghdr *n, struct Qdisc *q,
1837 			 unsigned long cl, int event)
1838 {
1839 	struct sk_buff *skb;
1840 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1841 	int err = 0;
1842 
1843 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1844 	if (!skb)
1845 		return -ENOBUFS;
1846 
1847 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1848 		kfree_skb(skb);
1849 		return -EINVAL;
1850 	}
1851 
1852 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1853 			     n->nlmsg_flags & NLM_F_ECHO);
1854 	if (err > 0)
1855 		err = 0;
1856 	return err;
1857 }
1858 
1859 static int tclass_del_notify(struct net *net,
1860 			     const struct Qdisc_class_ops *cops,
1861 			     struct sk_buff *oskb, struct nlmsghdr *n,
1862 			     struct Qdisc *q, unsigned long cl)
1863 {
1864 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1865 	struct sk_buff *skb;
1866 	int err = 0;
1867 
1868 	if (!cops->delete)
1869 		return -EOPNOTSUPP;
1870 
1871 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1872 	if (!skb)
1873 		return -ENOBUFS;
1874 
1875 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1876 			   RTM_DELTCLASS) < 0) {
1877 		kfree_skb(skb);
1878 		return -EINVAL;
1879 	}
1880 
1881 	err = cops->delete(q, cl);
1882 	if (err) {
1883 		kfree_skb(skb);
1884 		return err;
1885 	}
1886 
1887 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1888 			     n->nlmsg_flags & NLM_F_ECHO);
1889 	if (err > 0)
1890 		err = 0;
1891 	return err;
1892 }
1893 
1894 #ifdef CONFIG_NET_CLS
1895 
1896 struct tcf_bind_args {
1897 	struct tcf_walker w;
1898 	u32 classid;
1899 	unsigned long cl;
1900 };
1901 
1902 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1903 {
1904 	struct tcf_bind_args *a = (void *)arg;
1905 
1906 	if (tp->ops->bind_class) {
1907 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1908 
1909 		sch_tree_lock(q);
1910 		tp->ops->bind_class(n, a->classid, a->cl);
1911 		sch_tree_unlock(q);
1912 	}
1913 	return 0;
1914 }
1915 
1916 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1917 			   unsigned long new_cl)
1918 {
1919 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1920 	struct tcf_block *block;
1921 	struct tcf_chain *chain;
1922 	unsigned long cl;
1923 
1924 	cl = cops->find(q, portid);
1925 	if (!cl)
1926 		return;
1927 	block = cops->tcf_block(q, cl, NULL);
1928 	if (!block)
1929 		return;
1930 	for (chain = tcf_get_next_chain(block, NULL);
1931 	     chain;
1932 	     chain = tcf_get_next_chain(block, chain)) {
1933 		struct tcf_proto *tp;
1934 
1935 		for (tp = tcf_get_next_proto(chain, NULL, true);
1936 		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
1937 			struct tcf_bind_args arg = {};
1938 
1939 			arg.w.fn = tcf_node_bind;
1940 			arg.classid = clid;
1941 			arg.cl = new_cl;
1942 			tp->ops->walk(tp, &arg.w, true);
1943 		}
1944 	}
1945 }

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - class is exactly X:Y.
	   handle == X:0	 - root class.
	 */
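	/* Worked example (values illustrative): a handle is a 32-bit
	 * <major:minor> pair with the major in the upper 16 bits.  "class
	 * 1:2" is tcm_handle 0x00010002: TC_H_MAJ() extracts 0x00010000
	 * (the qdisc 1:), TC_H_MIN() extracts 0x2, and
	 * TC_H_MAKE(0x00010000, 0x2) composes them back into 0x00010002.
	 */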

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		 * both parent and child.
		 *
		 * TC_H_MAJ(portid) may still be unspecified; complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind filters from the class by rebinding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
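
/* For orientation only -- illustrative iproute2 commands (not part of this
 * file) and the messages they produce under the usual tc(8) conventions:
 *
 *   tc class add dev eth0 parent 1: classid 1:10 htb rate 1mbit
 *	-> RTM_NEWTCLASS with NLM_F_CREATE | NLM_F_EXCL
 *   tc class change dev eth0 classid 1:10 ...
 *	-> RTM_NEWTCLASS with neither flag set
 *   tc class del dev eth0 classid 1:10
 *	-> RTM_DELTCLASS
 *   tc class show dev eth0
 *	-> RTM_GETTCLASS dump, handled by tc_dump_tclass() below
 */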

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

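/* Same first-member layout trick as tcf_bind_args: qdisc_class_dump()
 * casts the qdisc_walker pointer it is handed back to qdisc_dump_args.
 */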
static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
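
/* Resume state across dump invocations lives in cb->args[]: args[0] is the
 * index of the qdisc being walked (t/s_t above) and args[1] the number of
 * classes already dumped inside it, so an interrupted dump restarts where
 * the previous skb filled up.
 */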

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
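
/* Typical /proc/net/psched output on a system with 1 ns hrtimer resolution
 * (values illustrative): "000003e8 00000040 000f4240 3b9aca00", i.e.
 * 1000 ns per usec, 64 ns per psched tick (PSCHED_SHIFT == 6), the constant
 * 1000000, and 10^9 clock-resolution units per second.
 */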

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

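	/* Register the built-in qdiscs.  The return values are deliberately
	 * ignored: at boot the only failure mode of register_qdisc() is a
	 * duplicate name, which cannot happen here.
	 */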
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);