1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/slab.h>
31 #include <linux/hashtable.h>
32 
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 #include <net/pkt_cls.h>
38 
39 /*
40 
41    Short review.
42    -------------
43 
44    This file consists of two interrelated parts:
45 
46    1. queueing disciplines manager frontend.
47    2. traffic classes manager frontend.
48 
49    Generally, a queueing discipline ("qdisc") is a black box,
50    which is able to enqueue packets and to dequeue them (when
51    the device is ready to send something) in an order and at times
52    determined by the algorithm hidden inside it.
53 
54    qdiscs are divided into two categories:
55    - "queues", which have no internal structure visible from outside.
56    - "schedulers", which split packets into "traffic classes",
57      using "packet classifiers" (see cls_api.c).
58 
59    In turn, classes may have child qdiscs (as a rule, queues)
60    attached to them, and so on.
61 
62    The goal of the routines in this file is to translate
63    the information supplied by the user in the form of handles
64    into a form more intelligible to the kernel, to perform sanity
65    checks and the parts of the work common to all qdiscs,
66    and to provide rtnetlink notifications.
67 
68    All real intelligent work is done inside qdisc modules.
69 
70 
71 
72    Every discipline has two major routines: enqueue and dequeue.
73 
74    ---dequeue
75 
76    dequeue usually returns an skb to send. It is allowed to return NULL,
77    but that does not mean the queue is empty; it just means the
78    discipline does not want to send anything this time.
79    The queue is really empty only if q->q.qlen == 0.
80    For complicated disciplines with multiple queues, q->q is not the
81    real packet queue, but q->q.qlen must nevertheless be valid.
82 
83    ---enqueue
84 
85    enqueue returns 0 if the packet was enqueued successfully.
86    If a packet (this one or another one) was dropped, it returns
87    a non-zero error code.
88    NET_XMIT_DROP 	- this packet was dropped.
89      Expected action: do not back off, but wait until the queue clears.
90    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
91      Expected action: back off or ignore.
92 
93    Auxiliary routines:
94 
95    ---peek
96 
97    like dequeue but without removing a packet from the queue
98 
99    ---reset
100 
101    returns the qdisc to its initial state: purges all buffers, clears all
102    timers and counters (except for statistics), etc.
103 
104    ---init
105 
106    initializes newly created qdisc.
107 
108    ---destroy
109 
110    destroys resources allocated by init and during the lifetime of the qdisc.
111 
112    ---change
113 
114    changes qdisc parameters.
115  */
116 
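/* Editor's illustration, not part of the original kernel file: a minimal
 * FIFO-style qdisc wired up from the enqueue/dequeue contract described
 * above. The "example_" names are hypothetical; the helpers used are the
 * real ones from include/net/sch_generic.h. It is kept under #if 0 as a
 * sketch only.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free)
{
	/* Accept while below sch->limit (a real qdisc would set the
	 * limit in its ->init() from TCA_OPTIONS).
	 */
	if (likely(sch->q.qlen < sch->limit))
		return qdisc_enqueue_tail(skb, sch);	/* NET_XMIT_SUCCESS */
	return qdisc_drop(skb, sch, to_free);		/* NET_XMIT_DROP */
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= qdisc_dequeue_head,	/* pop from the head of sch->q */
	.peek		= qdisc_peek_head,	/* look without removing */
	.owner		= THIS_MODULE,
};

/* A module would register this from its init function with:
 *	register_qdisc(&example_qdisc_ops);
 */
#endif
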
117 /* Protects the list of registered TC modules. It is a pure SMP lock. */
118 static DEFINE_RWLOCK(qdisc_mod_lock);
119 
120 
121 /************************************************
122  *	Queueing disciplines manipulation.	*
123  ************************************************/
124 
125 
126 /* The list of all installed queueing disciplines. */
127 
128 static struct Qdisc_ops *qdisc_base;
129 
130 /* Register/unregister queueing discipline */
131 
132 int register_qdisc(struct Qdisc_ops *qops)
133 {
134 	struct Qdisc_ops *q, **qp;
135 	int rc = -EEXIST;
136 
137 	write_lock(&qdisc_mod_lock);
138 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
139 		if (!strcmp(qops->id, q->id))
140 			goto out;
141 
142 	if (qops->enqueue == NULL)
143 		qops->enqueue = noop_qdisc_ops.enqueue;
144 	if (qops->peek == NULL) {
145 		if (qops->dequeue == NULL)
146 			qops->peek = noop_qdisc_ops.peek;
147 		else
148 			goto out_einval;
149 	}
150 	if (qops->dequeue == NULL)
151 		qops->dequeue = noop_qdisc_ops.dequeue;
152 
153 	if (qops->cl_ops) {
154 		const struct Qdisc_class_ops *cops = qops->cl_ops;
155 
156 		if (!(cops->find && cops->walk && cops->leaf))
157 			goto out_einval;
158 
159 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
160 			goto out_einval;
161 	}
162 
163 	qops->next = NULL;
164 	*qp = qops;
165 	rc = 0;
166 out:
167 	write_unlock(&qdisc_mod_lock);
168 	return rc;
169 
170 out_einval:
171 	rc = -EINVAL;
172 	goto out;
173 }
174 EXPORT_SYMBOL(register_qdisc);
175 
176 int unregister_qdisc(struct Qdisc_ops *qops)
177 {
178 	struct Qdisc_ops *q, **qp;
179 	int err = -ENOENT;
180 
181 	write_lock(&qdisc_mod_lock);
182 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
183 		if (q == qops)
184 			break;
185 	if (q) {
186 		*qp = q->next;
187 		q->next = NULL;
188 		err = 0;
189 	}
190 	write_unlock(&qdisc_mod_lock);
191 	return err;
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194 
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198 	read_lock(&qdisc_mod_lock);
199 	strlcpy(name, default_qdisc_ops->id, len);
200 	read_unlock(&qdisc_mod_lock);
201 }
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205 	struct Qdisc_ops *q = NULL;
206 
207 	for (q = qdisc_base; q; q = q->next) {
208 		if (!strcmp(name, q->id)) {
209 			if (!try_module_get(q->owner))
210 				q = NULL;
211 			break;
212 		}
213 	}
214 
215 	return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221 	const struct Qdisc_ops *ops;
222 
223 	if (!capable(CAP_NET_ADMIN))
224 		return -EPERM;
225 
226 	write_lock(&qdisc_mod_lock);
227 	ops = qdisc_lookup_default(name);
228 	if (!ops) {
229 		/* Not found, drop lock and try to load module */
230 		write_unlock(&qdisc_mod_lock);
231 		request_module("sch_%s", name);
232 		write_lock(&qdisc_mod_lock);
233 
234 		ops = qdisc_lookup_default(name);
235 	}
236 
237 	if (ops) {
238 		/* Set new default */
239 		module_put(default_qdisc_ops->owner);
240 		default_qdisc_ops = ops;
241 	}
242 	write_unlock(&qdisc_mod_lock);
243 
244 	return ops ? 0 : -ENOENT;
245 }
246 
247 #ifdef CONFIG_NET_SCH_DEFAULT
248 /* Set default value from kernel config */
249 static int __init sch_default_qdisc(void)
250 {
251 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 }
253 late_initcall(sch_default_qdisc);
254 #endif
255 
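/* Editor's note (illustrative usage): besides CONFIG_DEFAULT_NET_SCH,
 * the default qdisc can also be changed at runtime through the
 * net.core.default_qdisc sysctl, which ends up in qdisc_set_default()
 * above, e.g.:
 *
 *	sysctl -w net.core.default_qdisc=fq_codel
 */
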
256 /* We know the handle. Find the qdisc among all qdiscs attached to the
257  * device (the root qdisc, all its children, children of children, etc.).
258  * Note: the caller must hold either RTNL or rcu_read_lock().
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263 	struct Qdisc *q;
264 
265 	if (!qdisc_dev(root))
266 		return (root->handle == handle ? root : NULL);
267 
268 	if (!(root->flags & TCQ_F_BUILTIN) &&
269 	    root->handle == handle)
270 		return root;
271 
272 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
273 		if (q->handle == handle)
274 			return q;
275 	}
276 	return NULL;
277 }
278 
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282 		ASSERT_RTNL();
283 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284 		if (invisible)
285 			q->flags |= TCQ_F_INVISIBLE;
286 	}
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289 
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293 		ASSERT_RTNL();
294 		hash_del_rcu(&q->hash);
295 	}
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298 
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301 	struct Qdisc *q;
302 
303 	if (!handle)
304 		return NULL;
305 	q = qdisc_match_from_root(dev->qdisc, handle);
306 	if (q)
307 		goto out;
308 
309 	if (dev_ingress_queue(dev))
310 		q = qdisc_match_from_root(
311 			dev_ingress_queue(dev)->qdisc_sleeping,
312 			handle);
313 out:
314 	return q;
315 }
316 
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319 	struct netdev_queue *nq;
320 	struct Qdisc *q;
321 
322 	if (!handle)
323 		return NULL;
324 	q = qdisc_match_from_root(dev->qdisc, handle);
325 	if (q)
326 		goto out;
327 
328 	nq = dev_ingress_queue_rcu(dev);
329 	if (nq)
330 		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332 	return q;
333 }
334 
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337 	unsigned long cl;
338 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
339 
340 	if (cops == NULL)
341 		return NULL;
342 	cl = cops->find(p, classid);
343 
344 	if (cl == 0)
345 		return NULL;
346 	return cops->leaf(p, cl);
347 }
348 
349 /* Find queueing discipline by name */
350 
351 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
352 {
353 	struct Qdisc_ops *q = NULL;
354 
355 	if (kind) {
356 		read_lock(&qdisc_mod_lock);
357 		for (q = qdisc_base; q; q = q->next) {
358 			if (nla_strcmp(kind, q->id) == 0) {
359 				if (!try_module_get(q->owner))
360 					q = NULL;
361 				break;
362 			}
363 		}
364 		read_unlock(&qdisc_mod_lock);
365 	}
366 	return q;
367 }
368 
369 /* The linklayer setting was not transferred by older iproute2
370  * versions, and the rate table lookup system has been dropped from
371  * the kernel. To stay backward compatible with older iproute2 tc
372  * utils, we detect the linklayer setting by checking whether the
373  * rate table was modified.
374  *
375  * For linklayer ATM table entries, the rate table will be aligned to
376  * 48 bytes, thus some table entries will contain the same value.  The
377  * mpu (min packet unit) is also encoded into the old rate table, thus
378  * starting from the mpu, we find the low and high table entries for
379  * mapping this cell.  If these entries contain the same value, then
380  * the rate table has been modified for linklayer ATM.
381  *
382  * This is done by rounding mpu up to the nearest 48-byte cell/entry,
383  * then rounding up to the next cell, calculating the table entry one
384  * below it, and comparing the two.
385  */
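/* Editor's worked example (illustrative numbers): with r->mpu = 96 and
 * r->cell_log = 3, low = roundup(96, 48) = 96, high = roundup(97, 48) =
 * 144, so cell_low = 96 >> 3 = 12 and cell_high = (144 >> 3) - 1 = 17.
 * On an ATM-aligned table, entries 12 and 17 map to the same ATM cell
 * count and hence hold the same value, so rtab[12] == rtab[17] triggers
 * TC_LINKLAYER_ATM detection.
 */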
386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
387 {
388 	int low       = roundup(r->mpu, 48);
389 	int high      = roundup(low+1, 48);
390 	int cell_low  = low >> r->cell_log;
391 	int cell_high = (high >> r->cell_log) - 1;
392 
393 	/* rtab is too inaccurate at rates > 100Mbit/s */
394 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
395 		pr_debug("TC linklayer: Giving up ATM detection\n");
396 		return TC_LINKLAYER_ETHERNET;
397 	}
398 
399 	if ((cell_high > cell_low) && (cell_high < 256)
400 	    && (rtab[cell_low] == rtab[cell_high])) {
401 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
402 			 cell_low, cell_high, rtab[cell_high]);
403 		return TC_LINKLAYER_ATM;
404 	}
405 	return TC_LINKLAYER_ETHERNET;
406 }
407 
408 static struct qdisc_rate_table *qdisc_rtab_list;
409 
410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
411 					struct nlattr *tab,
412 					struct netlink_ext_ack *extack)
413 {
414 	struct qdisc_rate_table *rtab;
415 
416 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
417 	    nla_len(tab) != TC_RTAB_SIZE) {
418 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419 		return NULL;
420 	}
421 
422 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
425 			rtab->refcnt++;
426 			return rtab;
427 		}
428 	}
429 
430 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431 	if (rtab) {
432 		rtab->rate = *r;
433 		rtab->refcnt = 1;
434 		memcpy(rtab->data, nla_data(tab), 1024);
435 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
436 			r->linklayer = __detect_linklayer(r, rtab->data);
437 		rtab->next = qdisc_rtab_list;
438 		qdisc_rtab_list = rtab;
439 	} else {
440 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441 	}
442 	return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
445 
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448 	struct qdisc_rate_table *rtab, **rtabp;
449 
450 	if (!tab || --tab->refcnt)
451 		return;
452 
453 	for (rtabp = &qdisc_rtab_list;
454 	     (rtab = *rtabp) != NULL;
455 	     rtabp = &rtab->next) {
456 		if (rtab == tab) {
457 			*rtabp = rtab->next;
458 			kfree(rtab);
459 			return;
460 		}
461 	}
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464 
465 static LIST_HEAD(qdisc_stab_list);
466 
467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
468 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
469 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
470 };
471 
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473 					       struct netlink_ext_ack *extack)
474 {
475 	struct nlattr *tb[TCA_STAB_MAX + 1];
476 	struct qdisc_size_table *stab;
477 	struct tc_sizespec *s;
478 	unsigned int tsize = 0;
479 	u16 *tab = NULL;
480 	int err;
481 
482 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
483 	if (err < 0)
484 		return ERR_PTR(err);
485 	if (!tb[TCA_STAB_BASE]) {
486 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
487 		return ERR_PTR(-EINVAL);
488 	}
489 
490 	s = nla_data(tb[TCA_STAB_BASE]);
491 
492 	if (s->tsize > 0) {
493 		if (!tb[TCA_STAB_DATA]) {
494 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
495 			return ERR_PTR(-EINVAL);
496 		}
497 		tab = nla_data(tb[TCA_STAB_DATA]);
498 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
499 	}
500 
501 	if (tsize != s->tsize || (!tab && tsize > 0)) {
502 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
503 		return ERR_PTR(-EINVAL);
504 	}
505 
506 	list_for_each_entry(stab, &qdisc_stab_list, list) {
507 		if (memcmp(&stab->szopts, s, sizeof(*s)))
508 			continue;
509 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
510 			continue;
511 		stab->refcnt++;
512 		return stab;
513 	}
514 
515 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
516 	if (!stab)
517 		return ERR_PTR(-ENOMEM);
518 
519 	stab->refcnt = 1;
520 	stab->szopts = *s;
521 	if (tsize > 0)
522 		memcpy(stab->data, tab, tsize * sizeof(u16));
523 
524 	list_add_tail(&stab->list, &qdisc_stab_list);
525 
526 	return stab;
527 }
528 
529 void qdisc_put_stab(struct qdisc_size_table *tab)
530 {
531 	if (!tab)
532 		return;
533 
534 	if (--tab->refcnt == 0) {
535 		list_del(&tab->list);
536 		kfree_rcu(tab, rcu);
537 	}
538 }
539 EXPORT_SYMBOL(qdisc_put_stab);
540 
541 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
542 {
543 	struct nlattr *nest;
544 
545 	nest = nla_nest_start(skb, TCA_STAB);
546 	if (nest == NULL)
547 		goto nla_put_failure;
548 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
549 		goto nla_put_failure;
550 	nla_nest_end(skb, nest);
551 
552 	return skb->len;
553 
554 nla_put_failure:
555 	return -1;
556 }
557 
558 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
559 			       const struct qdisc_size_table *stab)
560 {
561 	int pkt_len, slot;
562 
563 	pkt_len = skb->len + stab->szopts.overhead;
564 	if (unlikely(!stab->szopts.tsize))
565 		goto out;
566 
567 	slot = pkt_len + stab->szopts.cell_align;
568 	if (unlikely(slot < 0))
569 		slot = 0;
570 
571 	slot >>= stab->szopts.cell_log;
572 	if (likely(slot < stab->szopts.tsize))
573 		pkt_len = stab->data[slot];
574 	else
575 		pkt_len = stab->data[stab->szopts.tsize - 1] *
576 				(slot / stab->szopts.tsize) +
577 				stab->data[slot % stab->szopts.tsize];
578 
579 	pkt_len <<= stab->szopts.size_log;
580 out:
581 	if (unlikely(pkt_len < 1))
582 		pkt_len = 1;
583 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
584 }
585 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
586 
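/* Editor's worked example (illustrative numbers): with overhead = 14,
 * cell_align = 0, cell_log = 2, size_log = 0 and tsize = 512, a
 * 1000-byte skb yields pkt_len = 1014 and slot = 1014 >> 2 = 253, so
 * the qdisc sees stab->data[253] as the packet length. Slots past the
 * end of the table are extrapolated from the last entry, as coded above.
 */
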
587 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
588 {
589 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
590 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
591 			txt, qdisc->ops->id, qdisc->handle >> 16);
592 		qdisc->flags |= TCQ_F_WARN_NONWC;
593 	}
594 }
595 EXPORT_SYMBOL(qdisc_warn_nonwc);
596 
597 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
598 {
599 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
600 						 timer);
601 
602 	rcu_read_lock();
603 	__netif_schedule(qdisc_root(wd->qdisc));
604 	rcu_read_unlock();
605 
606 	return HRTIMER_NORESTART;
607 }
608 
609 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
610 				 clockid_t clockid)
611 {
612 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
613 	wd->timer.function = qdisc_watchdog;
614 	wd->qdisc = qdisc;
615 }
616 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
617 
618 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
619 {
620 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
621 }
622 EXPORT_SYMBOL(qdisc_watchdog_init);
623 
624 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
625 {
626 	if (test_bit(__QDISC_STATE_DEACTIVATED,
627 		     &qdisc_root_sleeping(wd->qdisc)->state))
628 		return;
629 
630 	if (wd->last_expires == expires)
631 		return;
632 
633 	wd->last_expires = expires;
634 	hrtimer_start(&wd->timer,
635 		      ns_to_ktime(expires),
636 		      HRTIMER_MODE_ABS_PINNED);
637 }
638 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
639 
640 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
641 {
642 	hrtimer_cancel(&wd->timer);
643 }
644 EXPORT_SYMBOL(qdisc_watchdog_cancel);
645 
646 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
647 {
648 	struct hlist_head *h;
649 	unsigned int i;
650 
651 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
652 
653 	if (h != NULL) {
654 		for (i = 0; i < n; i++)
655 			INIT_HLIST_HEAD(&h[i]);
656 	}
657 	return h;
658 }
659 
660 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
661 {
662 	struct Qdisc_class_common *cl;
663 	struct hlist_node *next;
664 	struct hlist_head *nhash, *ohash;
665 	unsigned int nsize, nmask, osize;
666 	unsigned int i, h;
667 
668 	/* Rehash when load factor exceeds 0.75 */
669 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
670 		return;
671 	nsize = clhash->hashsize * 2;
672 	nmask = nsize - 1;
673 	nhash = qdisc_class_hash_alloc(nsize);
674 	if (nhash == NULL)
675 		return;
676 
677 	ohash = clhash->hash;
678 	osize = clhash->hashsize;
679 
680 	sch_tree_lock(sch);
681 	for (i = 0; i < osize; i++) {
682 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
683 			h = qdisc_class_hash(cl->classid, nmask);
684 			hlist_add_head(&cl->hnode, &nhash[h]);
685 		}
686 	}
687 	clhash->hash     = nhash;
688 	clhash->hashsize = nsize;
689 	clhash->hashmask = nmask;
690 	sch_tree_unlock(sch);
691 
692 	kvfree(ohash);
693 }
694 EXPORT_SYMBOL(qdisc_class_hash_grow);
695 
696 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
697 {
698 	unsigned int size = 4;
699 
700 	clhash->hash = qdisc_class_hash_alloc(size);
701 	if (!clhash->hash)
702 		return -ENOMEM;
703 	clhash->hashsize  = size;
704 	clhash->hashmask  = size - 1;
705 	clhash->hashelems = 0;
706 	return 0;
707 }
708 EXPORT_SYMBOL(qdisc_class_hash_init);
709 
710 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
711 {
712 	kvfree(clhash->hash);
713 }
714 EXPORT_SYMBOL(qdisc_class_hash_destroy);
715 
716 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
717 			     struct Qdisc_class_common *cl)
718 {
719 	unsigned int h;
720 
721 	INIT_HLIST_NODE(&cl->hnode);
722 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
723 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
724 	clhash->hashelems++;
725 }
726 EXPORT_SYMBOL(qdisc_class_hash_insert);
727 
728 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
729 			     struct Qdisc_class_common *cl)
730 {
731 	hlist_del(&cl->hnode);
732 	clhash->hashelems--;
733 }
734 EXPORT_SYMBOL(qdisc_class_hash_remove);
735 
736 /* Allocate a unique handle from the space managed by the kernel.
737  * Possible range is [8000-FFFF]:0000 (0x8000 values).
738  */
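/* Editor's note (illustrative): a handle is a 32-bit major:minor pair;
 * handle 8001:0000 is TC_H_MAKE(0x80010000U, 0), and TC_H_MAJ()/TC_H_MIN()
 * extract the two halves. Qdisc handles always have minor 0, which is why
 * the loop below advances by TC_H_MAKE(0x10000U, 0), i.e. one major step.
 */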
739 static u32 qdisc_alloc_handle(struct net_device *dev)
740 {
741 	int i = 0x8000;
742 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
743 
744 	do {
745 		autohandle += TC_H_MAKE(0x10000U, 0);
746 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
747 			autohandle = TC_H_MAKE(0x80000000U, 0);
748 		if (!qdisc_lookup(dev, autohandle))
749 			return autohandle;
750 		cond_resched();
751 	} while	(--i > 0);
752 
753 	return 0;
754 }
755 
756 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
757 {
758 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
759 	const struct Qdisc_class_ops *cops;
760 	unsigned long cl;
761 	u32 parentid;
762 	bool notify;
763 	int drops;
764 
765 	if (n == 0 && len == 0)
766 		return;
767 	drops = max_t(int, n, 0);
768 	rcu_read_lock();
769 	while ((parentid = sch->parent)) {
770 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
771 			break;
772 
773 		if (sch->flags & TCQ_F_NOPARENT)
774 			break;
775 		/* Notify parent qdisc only if child qdisc becomes empty.
776 		 *
777 		 * If the child was empty even before this update, the backlog
778 		 * counter is inconsistent and we skip the notification because
779 		 * the parent class is already passive.
780 		 *
781 		 * If the original child was offloaded then it is allowed
782 		 * to be seen as empty, so the parent is notified anyway.
783 		 */
784 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
785 						       !qdisc_is_offloaded);
786 		/* TODO: perform the search on a per txq basis */
787 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
788 		if (sch == NULL) {
789 			WARN_ON_ONCE(parentid != TC_H_ROOT);
790 			break;
791 		}
792 		cops = sch->ops->cl_ops;
793 		if (notify && cops->qlen_notify) {
794 			cl = cops->find(sch, parentid);
795 			cops->qlen_notify(sch, cl);
796 		}
797 		sch->q.qlen -= n;
798 		sch->qstats.backlog -= len;
799 		__qdisc_qstats_drop(sch, drops);
800 	}
801 	rcu_read_unlock();
802 }
803 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
804 
805 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
806 			      void *type_data)
807 {
808 	struct net_device *dev = qdisc_dev(sch);
809 	int err;
810 
811 	sch->flags &= ~TCQ_F_OFFLOADED;
812 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
813 		return 0;
814 
815 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
816 	if (err == -EOPNOTSUPP)
817 		return 0;
818 
819 	if (!err)
820 		sch->flags |= TCQ_F_OFFLOADED;
821 
822 	return err;
823 }
824 EXPORT_SYMBOL(qdisc_offload_dump_helper);
825 
826 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
827 				struct Qdisc *new, struct Qdisc *old,
828 				enum tc_setup_type type, void *type_data,
829 				struct netlink_ext_ack *extack)
830 {
831 	bool any_qdisc_is_offloaded;
832 	int err;
833 
834 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
835 		return;
836 
837 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
838 
839 	/* Don't report error if the graft is part of destroy operation. */
840 	/* Don't report an error if the graft is part of a destroy operation. */
841 		return;
842 
843 	/* Don't report an error if the parent, the old child and the new
844 	 * one are not offloaded.
845 	 */
846 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
847 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
848 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
849 
850 	if (any_qdisc_is_offloaded)
851 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
852 }
853 EXPORT_SYMBOL(qdisc_offload_graft_helper);
854 
855 static void qdisc_offload_graft_root(struct net_device *dev,
856 				     struct Qdisc *new, struct Qdisc *old,
857 				     struct netlink_ext_ack *extack)
858 {
859 	struct tc_root_qopt_offload graft_offload = {
860 		.command	= TC_ROOT_GRAFT,
861 		.handle		= new ? new->handle : 0,
862 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
863 				  (old && old->flags & TCQ_F_INGRESS),
864 	};
865 
866 	qdisc_offload_graft_helper(dev, NULL, new, old,
867 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
868 }
869 
870 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
871 			 u32 portid, u32 seq, u16 flags, int event)
872 {
873 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
874 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
875 	struct tcmsg *tcm;
876 	struct nlmsghdr  *nlh;
877 	unsigned char *b = skb_tail_pointer(skb);
878 	struct gnet_dump d;
879 	struct qdisc_size_table *stab;
880 	u32 block_index;
881 	__u32 qlen;
882 
883 	cond_resched();
884 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
885 	if (!nlh)
886 		goto out_nlmsg_trim;
887 	tcm = nlmsg_data(nlh);
888 	tcm->tcm_family = AF_UNSPEC;
889 	tcm->tcm__pad1 = 0;
890 	tcm->tcm__pad2 = 0;
891 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
892 	tcm->tcm_parent = clid;
893 	tcm->tcm_handle = q->handle;
894 	tcm->tcm_info = refcount_read(&q->refcnt);
895 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
896 		goto nla_put_failure;
897 	if (q->ops->ingress_block_get) {
898 		block_index = q->ops->ingress_block_get(q);
899 		if (block_index &&
900 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
901 			goto nla_put_failure;
902 	}
903 	if (q->ops->egress_block_get) {
904 		block_index = q->ops->egress_block_get(q);
905 		if (block_index &&
906 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
907 			goto nla_put_failure;
908 	}
909 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
910 		goto nla_put_failure;
911 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
912 		goto nla_put_failure;
913 	qlen = qdisc_qlen_sum(q);
914 
915 	stab = rtnl_dereference(q->stab);
916 	if (stab && qdisc_dump_stab(skb, stab) < 0)
917 		goto nla_put_failure;
918 
919 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
920 					 NULL, &d, TCA_PAD) < 0)
921 		goto nla_put_failure;
922 
923 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
924 		goto nla_put_failure;
925 
926 	if (qdisc_is_percpu_stats(q)) {
927 		cpu_bstats = q->cpu_bstats;
928 		cpu_qstats = q->cpu_qstats;
929 	}
930 
931 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
932 				  &d, cpu_bstats, &q->bstats) < 0 ||
933 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
934 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
935 		goto nla_put_failure;
936 
937 	if (gnet_stats_finish_copy(&d) < 0)
938 		goto nla_put_failure;
939 
940 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
941 	return skb->len;
942 
943 out_nlmsg_trim:
944 nla_put_failure:
945 	nlmsg_trim(skb, b);
946 	return -1;
947 }
948 
949 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
950 {
951 	if (q->flags & TCQ_F_BUILTIN)
952 		return true;
953 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
954 		return true;
955 
956 	return false;
957 }
958 
959 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
960 			struct nlmsghdr *n, u32 clid,
961 			struct Qdisc *old, struct Qdisc *new)
962 {
963 	struct sk_buff *skb;
964 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
965 
966 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
967 	if (!skb)
968 		return -ENOBUFS;
969 
970 	if (old && !tc_qdisc_dump_ignore(old, false)) {
971 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
972 				  0, RTM_DELQDISC) < 0)
973 			goto err_out;
974 	}
975 	if (new && !tc_qdisc_dump_ignore(new, false)) {
976 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
977 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
978 			goto err_out;
979 	}
980 
981 	if (skb->len)
982 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
983 				      n->nlmsg_flags & NLM_F_ECHO);
984 
985 err_out:
986 	kfree_skb(skb);
987 	return -EINVAL;
988 }
989 
990 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
991 			       struct nlmsghdr *n, u32 clid,
992 			       struct Qdisc *old, struct Qdisc *new)
993 {
994 	if (new || old)
995 		qdisc_notify(net, skb, n, clid, old, new);
996 
997 	if (old)
998 		qdisc_put(old);
999 }
1000 
1001 static void qdisc_clear_nolock(struct Qdisc *sch)
1002 {
1003 	sch->flags &= ~TCQ_F_NOLOCK;
1004 	if (!(sch->flags & TCQ_F_CPUSTATS))
1005 		return;
1006 
1007 	free_percpu(sch->cpu_bstats);
1008 	free_percpu(sch->cpu_qstats);
1009 	sch->cpu_bstats = NULL;
1010 	sch->cpu_qstats = NULL;
1011 	sch->flags &= ~TCQ_F_CPUSTATS;
1012 }
1013 
1014 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1015  * to device "dev".
1016  *
1017  * When appropriate, send a netlink notification using "skb"
1018  * and "n".
1019  *
1020  * On success, destroy old qdisc.
1021  */
1022 
1023 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1024 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1025 		       struct Qdisc *new, struct Qdisc *old,
1026 		       struct netlink_ext_ack *extack)
1027 {
1028 	struct Qdisc *q = old;
1029 	struct net *net = dev_net(dev);
1030 
1031 	if (parent == NULL) {
1032 		unsigned int i, num_q, ingress;
1033 
1034 		ingress = 0;
1035 		num_q = dev->num_tx_queues;
1036 		if ((q && q->flags & TCQ_F_INGRESS) ||
1037 		    (new && new->flags & TCQ_F_INGRESS)) {
1038 			num_q = 1;
1039 			ingress = 1;
1040 			if (!dev_ingress_queue(dev)) {
1041 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1042 				return -ENOENT;
1043 			}
1044 		}
1045 
1046 		if (dev->flags & IFF_UP)
1047 			dev_deactivate(dev);
1048 
1049 		qdisc_offload_graft_root(dev, new, old, extack);
1050 
1051 		if (new && new->ops->attach)
1052 			goto skip;
1053 
1054 		for (i = 0; i < num_q; i++) {
1055 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1056 
1057 			if (!ingress)
1058 				dev_queue = netdev_get_tx_queue(dev, i);
1059 
1060 			old = dev_graft_qdisc(dev_queue, new);
1061 			if (new && i > 0)
1062 				qdisc_refcount_inc(new);
1063 
1064 			if (!ingress)
1065 				qdisc_put(old);
1066 		}
1067 
1068 skip:
1069 		if (!ingress) {
1070 			notify_and_destroy(net, skb, n, classid,
1071 					   dev->qdisc, new);
1072 			if (new && !new->ops->attach)
1073 				qdisc_refcount_inc(new);
1074 			dev->qdisc = new ? : &noop_qdisc;
1075 
1076 			if (new && new->ops->attach)
1077 				new->ops->attach(new);
1078 		} else {
1079 			notify_and_destroy(net, skb, n, classid, old, new);
1080 		}
1081 
1082 		if (dev->flags & IFF_UP)
1083 			dev_activate(dev);
1084 	} else {
1085 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1086 		unsigned long cl;
1087 		int err;
1088 
1089 		/* Only support running class lockless if parent is lockless */
1090 		if (new && (new->flags & TCQ_F_NOLOCK) &&
1091 		    parent && !(parent->flags & TCQ_F_NOLOCK))
1092 			qdisc_clear_nolock(new);
1093 
1094 		if (!cops || !cops->graft)
1095 			return -EOPNOTSUPP;
1096 
1097 		cl = cops->find(parent, classid);
1098 		if (!cl) {
1099 			NL_SET_ERR_MSG(extack, "Specified class not found");
1100 			return -ENOENT;
1101 		}
1102 
1103 		err = cops->graft(parent, cl, new, &old, extack);
1104 		if (err)
1105 			return err;
1106 		notify_and_destroy(net, skb, n, classid, old, new);
1107 	}
1108 	return 0;
1109 }
1110 
1111 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1112 				   struct netlink_ext_ack *extack)
1113 {
1114 	u32 block_index;
1115 
1116 	if (tca[TCA_INGRESS_BLOCK]) {
1117 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1118 
1119 		if (!block_index) {
1120 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1121 			return -EINVAL;
1122 		}
1123 		if (!sch->ops->ingress_block_set) {
1124 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1125 			return -EOPNOTSUPP;
1126 		}
1127 		sch->ops->ingress_block_set(sch, block_index);
1128 	}
1129 	if (tca[TCA_EGRESS_BLOCK]) {
1130 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1131 
1132 		if (!block_index) {
1133 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1134 			return -EINVAL;
1135 		}
1136 		if (!sch->ops->egress_block_set) {
1137 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1138 			return -EOPNOTSUPP;
1139 		}
1140 		sch->ops->egress_block_set(sch, block_index);
1141 	}
1142 	return 0;
1143 }
1144 
1145 /*
1146    Allocate and initialize a new qdisc.
1147 
1148    Parameters are passed via opt.
1149  */
1150 
1151 static struct Qdisc *qdisc_create(struct net_device *dev,
1152 				  struct netdev_queue *dev_queue,
1153 				  struct Qdisc *p, u32 parent, u32 handle,
1154 				  struct nlattr **tca, int *errp,
1155 				  struct netlink_ext_ack *extack)
1156 {
1157 	int err;
1158 	struct nlattr *kind = tca[TCA_KIND];
1159 	struct Qdisc *sch;
1160 	struct Qdisc_ops *ops;
1161 	struct qdisc_size_table *stab;
1162 
1163 	ops = qdisc_lookup_ops(kind);
1164 #ifdef CONFIG_MODULES
1165 	if (ops == NULL && kind != NULL) {
1166 		char name[IFNAMSIZ];
1167 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1168 			/* We dropped the RTNL semaphore in order to
1169 			 * perform the module load.  So, even if we
1170 			 * succeeded in loading the module we have to
1171 			 * tell the caller to replay the request.  We
1172 			 * indicate this using -EAGAIN.
1173 			 * We replay the request because the device may
1174 			 * go away in the mean time.
1175 			 */
1176 			rtnl_unlock();
1177 			request_module("sch_%s", name);
1178 			rtnl_lock();
1179 			ops = qdisc_lookup_ops(kind);
1180 			if (ops != NULL) {
1181 				/* We will try qdisc_lookup_ops again,
1182 				 * so don't keep a reference.
1183 				 */
1184 				module_put(ops->owner);
1185 				err = -EAGAIN;
1186 				goto err_out;
1187 			}
1188 		}
1189 	}
1190 #endif
1191 
1192 	err = -ENOENT;
1193 	if (!ops) {
1194 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1195 		goto err_out;
1196 	}
1197 
1198 	sch = qdisc_alloc(dev_queue, ops, extack);
1199 	if (IS_ERR(sch)) {
1200 		err = PTR_ERR(sch);
1201 		goto err_out2;
1202 	}
1203 
1204 	sch->parent = parent;
1205 
1206 	if (handle == TC_H_INGRESS) {
1207 		sch->flags |= TCQ_F_INGRESS;
1208 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1209 	} else {
1210 		if (handle == 0) {
1211 			handle = qdisc_alloc_handle(dev);
1212 			if (handle == 0) {
1213 				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1214 				err = -ENOSPC;
1215 				goto err_out3;
1216 			}
1217 		}
1218 		if (!netif_is_multiqueue(dev))
1219 			sch->flags |= TCQ_F_ONETXQUEUE;
1220 	}
1221 
1222 	sch->handle = handle;
1223 
1224 	/* This exists to keep backward compatibility with a userspace
1225 	 * loophole that allowed userspace to get the IFF_NO_QUEUE
1226 	 * facility on older kernels by setting tx_queue_len=0 (prior
1227 	 * to qdisc init), and then forgetting to reinit tx_queue_len
1228 	 * before attaching a qdisc again.
1229 	 */
1230 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1231 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1232 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1233 	}
1234 
1235 	err = qdisc_block_indexes_set(sch, tca, extack);
1236 	if (err)
1237 		goto err_out3;
1238 
1239 	if (ops->init) {
1240 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1241 		if (err != 0)
1242 			goto err_out5;
1243 	}
1244 
1245 	if (tca[TCA_STAB]) {
1246 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1247 		if (IS_ERR(stab)) {
1248 			err = PTR_ERR(stab);
1249 			goto err_out4;
1250 		}
1251 		rcu_assign_pointer(sch->stab, stab);
1252 	}
1253 	if (tca[TCA_RATE]) {
1254 		seqcount_t *running;
1255 
1256 		err = -EOPNOTSUPP;
1257 		if (sch->flags & TCQ_F_MQROOT) {
1258 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1259 			goto err_out4;
1260 		}
1261 
1262 		if (sch->parent != TC_H_ROOT &&
1263 		    !(sch->flags & TCQ_F_INGRESS) &&
1264 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1265 			running = qdisc_root_sleeping_running(sch);
1266 		else
1267 			running = &sch->running;
1268 
1269 		err = gen_new_estimator(&sch->bstats,
1270 					sch->cpu_bstats,
1271 					&sch->rate_est,
1272 					NULL,
1273 					running,
1274 					tca[TCA_RATE]);
1275 		if (err) {
1276 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1277 			goto err_out4;
1278 		}
1279 	}
1280 
1281 	qdisc_hash_add(sch, false);
1282 
1283 	return sch;
1284 
1285 err_out5:
1286 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1287 	if (ops->destroy)
1288 		ops->destroy(sch);
1289 err_out3:
1290 	dev_put(dev);
1291 	qdisc_free(sch);
1292 err_out2:
1293 	module_put(ops->owner);
1294 err_out:
1295 	*errp = err;
1296 	return NULL;
1297 
1298 err_out4:
1299 	/*
1300 	 * Any broken qdiscs that would require an ops->reset() here?
1301 	 * The qdisc was never in action so it shouldn't be necessary.
1302 	 */
1303 	qdisc_put_stab(rtnl_dereference(sch->stab));
1304 	if (ops->destroy)
1305 		ops->destroy(sch);
1306 	goto err_out3;
1307 }
1308 
1309 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1310 			struct netlink_ext_ack *extack)
1311 {
1312 	struct qdisc_size_table *ostab, *stab = NULL;
1313 	int err = 0;
1314 
1315 	if (tca[TCA_OPTIONS]) {
1316 		if (!sch->ops->change) {
1317 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1318 			return -EINVAL;
1319 		}
1320 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1321 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1322 			return -EOPNOTSUPP;
1323 		}
1324 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1325 		if (err)
1326 			return err;
1327 	}
1328 
1329 	if (tca[TCA_STAB]) {
1330 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1331 		if (IS_ERR(stab))
1332 			return PTR_ERR(stab);
1333 	}
1334 
1335 	ostab = rtnl_dereference(sch->stab);
1336 	rcu_assign_pointer(sch->stab, stab);
1337 	qdisc_put_stab(ostab);
1338 
1339 	if (tca[TCA_RATE]) {
1340 		/* NB: ignores errors from replace_estimator
1341 		   because change can't be undone. */
1342 		if (sch->flags & TCQ_F_MQROOT)
1343 			goto out;
1344 		gen_replace_estimator(&sch->bstats,
1345 				      sch->cpu_bstats,
1346 				      &sch->rate_est,
1347 				      NULL,
1348 				      qdisc_root_sleeping_running(sch),
1349 				      tca[TCA_RATE]);
1350 	}
1351 out:
1352 	return 0;
1353 }
1354 
1355 struct check_loop_arg {
1356 	struct qdisc_walker	w;
1357 	struct Qdisc		*p;
1358 	int			depth;
1359 };
1360 
1361 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1362 			 struct qdisc_walker *w);
1363 
1364 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1365 {
1366 	struct check_loop_arg	arg;
1367 
1368 	if (q->ops->cl_ops == NULL)
1369 		return 0;
1370 
1371 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1372 	arg.w.fn = check_loop_fn;
1373 	arg.depth = depth;
1374 	arg.p = p;
1375 	q->ops->cl_ops->walk(q, &arg.w);
1376 	return arg.w.stop ? -ELOOP : 0;
1377 }
1378 
1379 static int
1380 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1381 {
1382 	struct Qdisc *leaf;
1383 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1384 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1385 
1386 	leaf = cops->leaf(q, cl);
1387 	if (leaf) {
1388 		if (leaf == arg->p || arg->depth > 7)
1389 			return -ELOOP;
1390 		return check_loop(leaf, arg->p, arg->depth + 1);
1391 	}
1392 	return 0;
1393 }
1394 
1395 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1396 	[TCA_KIND]		= { .type = NLA_STRING },
1397 	[TCA_RATE]		= { .type = NLA_BINARY,
1398 				    .len = sizeof(struct tc_estimator) },
1399 	[TCA_STAB]		= { .type = NLA_NESTED },
1400 	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
1401 	[TCA_CHAIN]		= { .type = NLA_U32 },
1402 	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
1403 	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
1404 };
1405 
1406 /*
1407  * Delete/get qdisc.
1408  */
1409 
1410 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1411 			struct netlink_ext_ack *extack)
1412 {
1413 	struct net *net = sock_net(skb->sk);
1414 	struct tcmsg *tcm = nlmsg_data(n);
1415 	struct nlattr *tca[TCA_MAX + 1];
1416 	struct net_device *dev;
1417 	u32 clid;
1418 	struct Qdisc *q = NULL;
1419 	struct Qdisc *p = NULL;
1420 	int err;
1421 
1422 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1423 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1424 		return -EPERM;
1425 
1426 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1427 			  extack);
1428 	if (err < 0)
1429 		return err;
1430 
1431 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1432 	if (!dev)
1433 		return -ENODEV;
1434 
1435 	clid = tcm->tcm_parent;
1436 	if (clid) {
1437 		if (clid != TC_H_ROOT) {
1438 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1439 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1440 				if (!p) {
1441 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1442 					return -ENOENT;
1443 				}
1444 				q = qdisc_leaf(p, clid);
1445 			} else if (dev_ingress_queue(dev)) {
1446 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1447 			}
1448 		} else {
1449 			q = dev->qdisc;
1450 		}
1451 		if (!q) {
1452 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1453 			return -ENOENT;
1454 		}
1455 
1456 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1457 			NL_SET_ERR_MSG(extack, "Invalid handle");
1458 			return -EINVAL;
1459 		}
1460 	} else {
1461 		q = qdisc_lookup(dev, tcm->tcm_handle);
1462 		if (!q) {
1463 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1464 			return -ENOENT;
1465 		}
1466 	}
1467 
1468 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1469 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1470 		return -EINVAL;
1471 	}
1472 
1473 	if (n->nlmsg_type == RTM_DELQDISC) {
1474 		if (!clid) {
1475 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1476 			return -EINVAL;
1477 		}
1478 		if (q->handle == 0) {
1479 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1480 			return -ENOENT;
1481 		}
1482 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1483 		if (err != 0)
1484 			return err;
1485 	} else {
1486 		qdisc_notify(net, skb, n, clid, NULL, q);
1487 	}
1488 	return 0;
1489 }
1490 
1491 /*
1492  * Create/change qdisc.
1493  */
1494 
1495 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1496 			   struct netlink_ext_ack *extack)
1497 {
1498 	struct net *net = sock_net(skb->sk);
1499 	struct tcmsg *tcm;
1500 	struct nlattr *tca[TCA_MAX + 1];
1501 	struct net_device *dev;
1502 	u32 clid;
1503 	struct Qdisc *q, *p;
1504 	int err;
1505 
1506 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1507 		return -EPERM;
1508 
1509 replay:
1510 	/* Reinit, just in case something touches this. */
1511 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1512 			  extack);
1513 	if (err < 0)
1514 		return err;
1515 
1516 	tcm = nlmsg_data(n);
1517 	clid = tcm->tcm_parent;
1518 	q = p = NULL;
1519 
1520 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1521 	if (!dev)
1522 		return -ENODEV;
1523 
1524 
1525 	if (clid) {
1526 		if (clid != TC_H_ROOT) {
1527 			if (clid != TC_H_INGRESS) {
1528 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1529 				if (!p) {
1530 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1531 					return -ENOENT;
1532 				}
1533 				q = qdisc_leaf(p, clid);
1534 			} else if (dev_ingress_queue_create(dev)) {
1535 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1536 			}
1537 		} else {
1538 			q = dev->qdisc;
1539 		}
1540 
1541 		/* It may be the default qdisc; ignore it */
1542 		if (q && q->handle == 0)
1543 			q = NULL;
1544 
1545 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1546 			if (tcm->tcm_handle) {
1547 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1548 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1549 					return -EEXIST;
1550 				}
1551 				if (TC_H_MIN(tcm->tcm_handle)) {
1552 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1553 					return -EINVAL;
1554 				}
1555 				q = qdisc_lookup(dev, tcm->tcm_handle);
1556 				if (!q)
1557 					goto create_n_graft;
1558 				if (n->nlmsg_flags & NLM_F_EXCL) {
1559 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1560 					return -EEXIST;
1561 				}
1562 				if (tca[TCA_KIND] &&
1563 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1564 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1565 					return -EINVAL;
1566 				}
1567 				if (q == p ||
1568 				    (p && check_loop(q, p, 0))) {
1569 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1570 					return -ELOOP;
1571 				}
1572 				qdisc_refcount_inc(q);
1573 				goto graft;
1574 			} else {
1575 				if (!q)
1576 					goto create_n_graft;
1577 
1578 				/* This magic test requires explanation.
1579 				 *
1580 				 *   We know that some child q is already
1581 				 *   attached to this parent and we have a choice:
1582 				 *   either to change it or to create/graft a new one.
1583 				 *
1584 				 *   1. We are allowed to create/graft only
1585 				 *   if both the CREATE and REPLACE flags are set.
1586 				 *
1587 				 *   2. If EXCL is set, the requestor meant that
1588 				 *   the qdisc tcm_handle is not expected
1589 				 *   to exist, so we choose create/graft too.
1590 				 *
1591 				 *   3. The last case is when no flags are set.
1592 				 *   Alas, this is a hole in the API: we
1593 				 *   cannot decide what to do unambiguously.
1594 				 *   For now we select create/graft if the
1595 				 *   user gave a KIND that does not match the existing one.
1596 				 */
1597 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1598 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1599 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1600 				     (tca[TCA_KIND] &&
1601 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1602 					goto create_n_graft;
1603 			}
1604 		}
1605 	} else {
1606 		if (!tcm->tcm_handle) {
1607 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1608 			return -EINVAL;
1609 		}
1610 		q = qdisc_lookup(dev, tcm->tcm_handle);
1611 	}
1612 
1613 	/* Change qdisc parameters */
1614 	if (!q) {
1615 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1616 		return -ENOENT;
1617 	}
1618 	if (n->nlmsg_flags & NLM_F_EXCL) {
1619 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1620 		return -EEXIST;
1621 	}
1622 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1623 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1624 		return -EINVAL;
1625 	}
1626 	err = qdisc_change(q, tca, extack);
1627 	if (err == 0)
1628 		qdisc_notify(net, skb, n, clid, NULL, q);
1629 	return err;
1630 
1631 create_n_graft:
1632 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1633 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1634 		return -ENOENT;
1635 	}
1636 	if (clid == TC_H_INGRESS) {
1637 		if (dev_ingress_queue(dev)) {
1638 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1639 					 tcm->tcm_parent, tcm->tcm_parent,
1640 					 tca, &err, extack);
1641 		} else {
1642 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1643 			err = -ENOENT;
1644 		}
1645 	} else {
1646 		struct netdev_queue *dev_queue;
1647 
1648 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1649 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1650 		else if (p)
1651 			dev_queue = p->dev_queue;
1652 		else
1653 			dev_queue = netdev_get_tx_queue(dev, 0);
1654 
1655 		q = qdisc_create(dev, dev_queue, p,
1656 				 tcm->tcm_parent, tcm->tcm_handle,
1657 				 tca, &err, extack);
1658 	}
1659 	if (q == NULL) {
1660 		if (err == -EAGAIN)
1661 			goto replay;
1662 		return err;
1663 	}
1664 
1665 graft:
1666 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1667 	if (err) {
1668 		if (q)
1669 			qdisc_put(q);
1670 		return err;
1671 	}
1672 
1673 	return 0;
1674 }
1675 
1676 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1677 			      struct netlink_callback *cb,
1678 			      int *q_idx_p, int s_q_idx, bool recur,
1679 			      bool dump_invisible)
1680 {
1681 	int ret = 0, q_idx = *q_idx_p;
1682 	struct Qdisc *q;
1683 	int b;
1684 
1685 	if (!root)
1686 		return 0;
1687 
1688 	q = root;
1689 	if (q_idx < s_q_idx) {
1690 		q_idx++;
1691 	} else {
1692 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1693 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1694 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1695 				  RTM_NEWQDISC) <= 0)
1696 			goto done;
1697 		q_idx++;
1698 	}
1699 
1700 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1701 	 * itself has already been dumped.
1702 	 *
1703 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1704 	 * qdisc hashtable, we don't want to hit it again.
1705 	 */
1706 	if (!qdisc_dev(root) || !recur)
1707 		goto out;
1708 
1709 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1710 		if (q_idx < s_q_idx) {
1711 			q_idx++;
1712 			continue;
1713 		}
1714 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1715 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1716 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1717 				  RTM_NEWQDISC) <= 0)
1718 			goto done;
1719 		q_idx++;
1720 	}
1721 
1722 out:
1723 	*q_idx_p = q_idx;
1724 	return ret;
1725 done:
1726 	ret = -1;
1727 	goto out;
1728 }
1729 
1730 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1731 {
1732 	struct net *net = sock_net(skb->sk);
1733 	int idx, q_idx;
1734 	int s_idx, s_q_idx;
1735 	struct net_device *dev;
1736 	const struct nlmsghdr *nlh = cb->nlh;
1737 	struct nlattr *tca[TCA_MAX + 1];
1738 	int err;
1739 
1740 	s_idx = cb->args[0];
1741 	s_q_idx = q_idx = cb->args[1];
1742 
1743 	idx = 0;
1744 	ASSERT_RTNL();
1745 
1746 	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1747 			  rtm_tca_policy, cb->extack);
1748 	if (err < 0)
1749 		return err;
1750 
1751 	for_each_netdev(net, dev) {
1752 		struct netdev_queue *dev_queue;
1753 
1754 		if (idx < s_idx)
1755 			goto cont;
1756 		if (idx > s_idx)
1757 			s_q_idx = 0;
1758 		q_idx = 0;
1759 
1760 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1761 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1762 			goto done;
1763 
1764 		dev_queue = dev_ingress_queue(dev);
1765 		if (dev_queue &&
1766 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1767 				       &q_idx, s_q_idx, false,
1768 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1769 			goto done;
1770 
1771 cont:
1772 		idx++;
1773 	}
1774 
1775 done:
1776 	cb->args[0] = idx;
1777 	cb->args[1] = q_idx;
1778 
1779 	return skb->len;
1780 }
1781 
1782 
1783 
1784 /************************************************
1785  *	Traffic classes manipulation.		*
1786  ************************************************/
1787 
1788 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1789 			  unsigned long cl,
1790 			  u32 portid, u32 seq, u16 flags, int event)
1791 {
1792 	struct tcmsg *tcm;
1793 	struct nlmsghdr  *nlh;
1794 	unsigned char *b = skb_tail_pointer(skb);
1795 	struct gnet_dump d;
1796 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1797 
1798 	cond_resched();
1799 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1800 	if (!nlh)
1801 		goto out_nlmsg_trim;
1802 	tcm = nlmsg_data(nlh);
1803 	tcm->tcm_family = AF_UNSPEC;
1804 	tcm->tcm__pad1 = 0;
1805 	tcm->tcm__pad2 = 0;
1806 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1807 	tcm->tcm_parent = q->handle;
1808 	tcm->tcm_handle = q->handle;
1809 	tcm->tcm_info = 0;
1810 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1811 		goto nla_put_failure;
1812 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1813 		goto nla_put_failure;
1814 
1815 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1816 					 NULL, &d, TCA_PAD) < 0)
1817 		goto nla_put_failure;
1818 
1819 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1820 		goto nla_put_failure;
1821 
1822 	if (gnet_stats_finish_copy(&d) < 0)
1823 		goto nla_put_failure;
1824 
1825 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1826 	return skb->len;
1827 
1828 out_nlmsg_trim:
1829 nla_put_failure:
1830 	nlmsg_trim(skb, b);
1831 	return -1;
1832 }
1833 
1834 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1835 			 struct nlmsghdr *n, struct Qdisc *q,
1836 			 unsigned long cl, int event)
1837 {
1838 	struct sk_buff *skb;
1839 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1840 	int err = 0;
1841 
1842 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1843 	if (!skb)
1844 		return -ENOBUFS;
1845 
1846 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1847 		kfree_skb(skb);
1848 		return -EINVAL;
1849 	}
1850 
1851 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1852 			     n->nlmsg_flags & NLM_F_ECHO);
1853 	if (err > 0)
1854 		err = 0;
1855 	return err;
1856 }
1857 
1858 static int tclass_del_notify(struct net *net,
1859 			     const struct Qdisc_class_ops *cops,
1860 			     struct sk_buff *oskb, struct nlmsghdr *n,
1861 			     struct Qdisc *q, unsigned long cl)
1862 {
1863 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1864 	struct sk_buff *skb;
1865 	int err = 0;
1866 
1867 	if (!cops->delete)
1868 		return -EOPNOTSUPP;
1869 
1870 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1871 	if (!skb)
1872 		return -ENOBUFS;
1873 
1874 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1875 			   RTM_DELTCLASS) < 0) {
1876 		kfree_skb(skb);
1877 		return -EINVAL;
1878 	}
1879 
1880 	err = cops->delete(q, cl);
1881 	if (err) {
1882 		kfree_skb(skb);
1883 		return err;
1884 	}
1885 
1886 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1887 			     n->nlmsg_flags & NLM_F_ECHO);
1888 	if (err > 0)
1889 		err = 0;
1890 	return err;
1891 }
1892 
1893 #ifdef CONFIG_NET_CLS
1894 
1895 struct tcf_bind_args {
1896 	struct tcf_walker w;
1897 	u32 classid;
1898 	unsigned long cl;
1899 };
1900 
1901 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1902 {
1903 	struct tcf_bind_args *a = (void *)arg;
1904 
1905 	if (tp->ops->bind_class) {
1906 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1907 
1908 		sch_tree_lock(q);
1909 		tp->ops->bind_class(n, a->classid, a->cl);
1910 		sch_tree_unlock(q);
1911 	}
1912 	return 0;
1913 }
1914 
1915 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1916 			   unsigned long new_cl)
1917 {
1918 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1919 	struct tcf_block *block;
1920 	struct tcf_chain *chain;
1921 	unsigned long cl;
1922 
1923 	cl = cops->find(q, portid);
1924 	if (!cl)
1925 		return;
1926 	block = cops->tcf_block(q, cl, NULL);
1927 	if (!block)
1928 		return;
1929 	for (chain = tcf_get_next_chain(block, NULL);
1930 	     chain;
1931 	     chain = tcf_get_next_chain(block, chain)) {
1932 		struct tcf_proto *tp;
1933 
1934 		for (tp = tcf_get_next_proto(chain, NULL, true);
1935 		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
1936 			struct tcf_bind_args arg = {};
1937 
1938 			arg.w.fn = tcf_node_bind;
1939 			arg.classid = clid;
1940 			arg.cl = new_cl;
1941 			tp->ops->walk(tp, &arg.w, true);
1942 		}
1943 	}
1944 }
1945 
1946 #else
1947 
1948 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1949 			   unsigned long new_cl)
1950 {
1951 }
1952 
1953 #endif
1954 
1955 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1956 			 struct netlink_ext_ack *extack)
1957 {
1958 	struct net *net = sock_net(skb->sk);
1959 	struct tcmsg *tcm = nlmsg_data(n);
1960 	struct nlattr *tca[TCA_MAX + 1];
1961 	struct net_device *dev;
1962 	struct Qdisc *q = NULL;
1963 	const struct Qdisc_class_ops *cops;
1964 	unsigned long cl = 0;
1965 	unsigned long new_cl;
1966 	u32 portid;
1967 	u32 clid;
1968 	u32 qid;
1969 	int err;
1970 
1971 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1972 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1973 		return -EPERM;
1974 
1975 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1976 			  extack);
1977 	if (err < 0)
1978 		return err;
1979 
1980 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1981 	if (!dev)
1982 		return -ENODEV;
1983 
1984 	/*
1985 	   parent == TC_H_UNSPEC - unspecified parent.
1986 	   parent == TC_H_ROOT   - class is root, which has no parent.
1987 	   parent == X:0	 - parent is root class.
1988 	   parent == X:Y	 - parent is a node in hierarchy.
1989 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1990 
1991 	   handle == 0:0	 - generate handle from kernel pool.
1992 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1993 	   handle == X:Y	 - class is X:Y (fully specified).
1994 	   handle == X:0	 - root class.
1995 	 */
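	/*
	 * For reference: a handle packs the major (qdisc) number into the
	 * upper 16 bits and the minor (class) number into the lower 16 bits,
	 * so e.g. "1:10" is 0x0001000a.  TC_H_MAJ() masks the upper half,
	 * TC_H_MIN() the lower half, and TC_H_MAKE(maj, min) combines them:
	 *
	 *	TC_H_MAJ(0x0001000a)       == 0x00010000
	 *	TC_H_MIN(0x0001000a)       == 0x0000000a
	 *	TC_H_MAKE(0x00010000, 0xa) == 0x0001000a
	 */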
1996 
1997 	/* Step 1. Determine qdisc handle X:0 */
1998 
1999 	portid = tcm->tcm_parent;
2000 	clid = tcm->tcm_handle;
2001 	qid = TC_H_MAJ(clid);
2002 
2003 	if (portid != TC_H_ROOT) {
2004 		u32 qid1 = TC_H_MAJ(portid);
2005 
2006 		if (qid && qid1) {
2007 			/* If both majors are known, they must be identical. */
2008 			if (qid != qid1)
2009 				return -EINVAL;
2010 		} else if (qid1) {
2011 			qid = qid1;
2012 		} else if (qid == 0)
2013 			qid = dev->qdisc->handle;
2014 
2015 		/* Now qid is a genuine qdisc handle, consistent
2016 		 * with both parent and child.
2017 		 *
2018 		 * TC_H_MAJ(portid) may still be unspecified; complete it now.
2019 		 */
2020 		if (portid)
2021 			portid = TC_H_MAKE(qid, portid);
2022 	} else {
2023 		if (qid == 0)
2024 			qid = dev->qdisc->handle;
2025 	}
2026 
2027 	/* OK. Locate qdisc */
2028 	q = qdisc_lookup(dev, qid);
2029 	if (!q)
2030 		return -ENOENT;
2031 
2032 	/* And check that it supports classes */
2033 	cops = q->ops->cl_ops;
2034 	if (!cops)
2035 		return -EINVAL;
2036 
2037 	/* Now try to get class */
2038 	if (clid == 0) {
2039 		if (portid == TC_H_ROOT)
2040 			clid = qid;
2041 	} else
2042 		clid = TC_H_MAKE(qid, clid);
2043 
2044 	if (clid)
2045 		cl = cops->find(q, clid);
2046 
2047 	if (cl == 0) {
2048 		err = -ENOENT;
2049 		if (n->nlmsg_type != RTM_NEWTCLASS ||
2050 		    !(n->nlmsg_flags & NLM_F_CREATE))
2051 			goto out;
2052 	} else {
2053 		switch (n->nlmsg_type) {
2054 		case RTM_NEWTCLASS:
2055 			err = -EEXIST;
2056 			if (n->nlmsg_flags & NLM_F_EXCL)
2057 				goto out;
2058 			break;
2059 		case RTM_DELTCLASS:
2060 			err = tclass_del_notify(net, cops, skb, n, q, cl);
2061 		/* Unbind the class from its filters by rebinding them to 0 */
2062 			tc_bind_tclass(q, portid, clid, 0);
2063 			goto out;
2064 		case RTM_GETTCLASS:
2065 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2066 			goto out;
2067 		default:
2068 			err = -EINVAL;
2069 			goto out;
2070 		}
2071 	}
2072 
2073 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2074 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2075 		return -EOPNOTSUPP;
2076 	}
2077 
2078 	new_cl = cl;
2079 	err = -EOPNOTSUPP;
2080 	if (cops->change)
2081 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
2082 	if (err == 0) {
2083 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2084 		/* We just created a new class; do the reverse binding to it. */
2085 		if (cl != new_cl)
2086 			tc_bind_tclass(q, portid, clid, new_cl);
2087 	}
2088 out:
2089 	return err;
2090 }
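/*
 * For illustration, iproute2 maps its subcommands onto these messages
 * roughly as follows (the exact flag choices are an assumption about
 * typical iproute2 behaviour, not something this file enforces):
 *
 *	tc class add ...	RTM_NEWTCLASS, NLM_F_CREATE | NLM_F_EXCL
 *	tc class change ...	RTM_NEWTCLASS, no creation flags
 *	tc class replace ...	RTM_NEWTCLASS, NLM_F_CREATE
 *	tc class del ...	RTM_DELTCLASS
 *	tc class show ...	RTM_GETTCLASS (dump)
 */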
2091 
2092 struct qdisc_dump_args {
2093 	struct qdisc_walker	w;
2094 	struct sk_buff		*skb;
2095 	struct netlink_callback	*cb;
2096 };
2097 
2098 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2099 			    struct qdisc_walker *arg)
2100 {
2101 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2102 
2103 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2104 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2105 			      RTM_NEWTCLASS);
2106 }
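/*
 * qdisc_class_dump() adapts the qdisc's generic class walker to the
 * netlink dump: each walked class becomes one RTM_NEWTCLASS record
 * with NLM_F_MULTI set, as expected for multipart dump replies.
 */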
2107 
2108 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2109 				struct tcmsg *tcm, struct netlink_callback *cb,
2110 				int *t_p, int s_t)
2111 {
2112 	struct qdisc_dump_args arg;
2113 
2114 	if (tc_qdisc_dump_ignore(q, false) ||
2115 	    *t_p < s_t || !q->ops->cl_ops ||
2116 	    (tcm->tcm_parent &&
2117 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2118 		(*t_p)++;
2119 		return 0;
2120 	}
2121 	if (*t_p > s_t)
2122 		memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
2123 	arg.w.fn = qdisc_class_dump;
2124 	arg.skb = skb;
2125 	arg.cb = cb;
2126 	arg.w.stop = 0;
2127 	arg.w.skip = cb->args[1];
2128 	arg.w.count = 0;
2129 	q->ops->cl_ops->walk(q, &arg.w);
2130 	cb->args[1] = arg.w.count;
2131 	if (arg.w.stop)
2132 		return -1;
2133 	(*t_p)++;
2134 	return 0;
2135 }
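/*
 * The dump cursor lives in cb->args[]: args[0] (s_t/t) counts qdiscs
 * already dumped in full, args[1] counts classes dumped within the
 * current qdisc, and the memset above clears the stale class cursor
 * once the walk has advanced past the qdisc it was interrupted in.
 */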
2136 
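/*
 * Dump the classes of the root qdisc itself, then either the single
 * qdisc named by tcm_parent or, if no parent was given, every qdisc
 * hashed on the device.
 */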
2137 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2138 			       struct tcmsg *tcm, struct netlink_callback *cb,
2139 			       int *t_p, int s_t)
2140 {
2141 	struct Qdisc *q;
2142 	int b;
2143 
2144 	if (!root)
2145 		return 0;
2146 
2147 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2148 		return -1;
2149 
2150 	if (!qdisc_dev(root))
2151 		return 0;
2152 
2153 	if (tcm->tcm_parent) {
2154 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2155 		if (q && q != root &&
2156 		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2157 			return -1;
2158 		return 0;
2159 	}
2160 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2161 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2162 			return -1;
2163 	}
2164 
2165 	return 0;
2166 }
2167 
2168 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2169 {
2170 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2171 	struct net *net = sock_net(skb->sk);
2172 	struct netdev_queue *dev_queue;
2173 	struct net_device *dev;
2174 	int t, s_t;
2175 
2176 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2177 		return 0;
2178 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2179 	if (!dev)
2180 		return 0;
2181 
2182 	s_t = cb->args[0];
2183 	t = 0;
2184 
2185 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2186 		goto done;
2187 
2188 	dev_queue = dev_ingress_queue(dev);
2189 	if (dev_queue &&
2190 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2191 				&t, s_t) < 0)
2192 		goto done;
2193 
2194 done:
2195 	cb->args[0] = t;
2196 
2197 	dev_put(dev);
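	/*
	 * A positive return (skb->len) tells the netlink dump machinery
	 * to call us again for more data; returning 0 ends the dump.
	 */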
2198 	return skb->len;
2199 }
2200 
2201 #ifdef CONFIG_PROC_FS
2202 static int psched_show(struct seq_file *seq, void *v)
2203 {
2204 	seq_printf(seq, "%08x %08x %08x %08x\n",
2205 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2206 		   1000000,
2207 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2208 
2209 	return 0;
2210 }
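/*
 * Userspace tc parses the four hex fields printed above to derive its
 * time conversion factors; the third value (1000000) is, as far as
 * this file is concerned, a fixed legacy constant kept only for
 * compatibility with existing readers.
 */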
2211 
2212 static int __net_init psched_net_init(struct net *net)
2213 {
2214 	struct proc_dir_entry *e;
2215 
2216 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2217 	if (!e)
2218 		return -ENOMEM;
2219 
2220 	return 0;
2221 }
2222 
2223 static void __net_exit psched_net_exit(struct net *net)
2224 {
2225 	remove_proc_entry("psched", net->proc_net);
2226 }
2227 #else
2228 static int __net_init psched_net_init(struct net *net)
2229 {
2230 	return 0;
2231 }
2232 
2233 static void __net_exit psched_net_exit(struct net *net)
2234 {
2235 }
2236 #endif
2237 
2238 static struct pernet_operations psched_net_ops = {
2239 	.init = psched_net_init,
2240 	.exit = psched_net_exit,
2241 };
2242 
2243 static int __init pktsched_init(void)
2244 {
2245 	int err;
2246 
2247 	err = register_pernet_subsys(&psched_net_ops);
2248 	if (err) {
2249 		pr_err("pktsched_init: cannot initialize per netns operations\n");
2251 		return err;
2252 	}
2253 
2254 	register_qdisc(&pfifo_fast_ops);
2255 	register_qdisc(&pfifo_qdisc_ops);
2256 	register_qdisc(&bfifo_qdisc_ops);
2257 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2258 	register_qdisc(&mq_qdisc_ops);
2259 	register_qdisc(&noqueue_qdisc_ops);
2260 
2261 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2262 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2263 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2264 		      0);
2265 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2266 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2267 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2268 		      0);
2269 
2270 	return 0;
2271 }
2272 
2273 subsys_initcall(pktsched_init);
2274