xref: /openbmc/linux/net/sched/sch_api.c (revision 7bd571b274fd15e0e7dc3d79d104f32928010eff)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c	Packet scheduler API.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13 
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28 
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35 
36 #include <trace/events/qdisc.h>
37 
38 /*
39 
40    Short review.
41    -------------
42 
43    This file consists of two interrelated parts:
44 
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47 
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52 
53    qdiscs are divided into two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets to "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57 
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60 
61    The goal of the routines in this file is to translate
62    the handles supplied by the user into a form that is
63    more convenient for the kernel, to perform sanity checks
64    and the part of the work that is common to all qdiscs,
65    and to provide rtnetlink notifications.
66 
67    All real intelligent work is done inside qdisc modules.
68 
69 
70 
71    Every discipline has two major routines: enqueue and dequeue.
72 
73    ---dequeue
74 
75    dequeue usually returns a skb to send. It is allowed to return NULL,
76    but it does not mean that queue is empty, it just means that
77    discipline does not want to send anything this time.
78    Queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues q->q is not
80    real packet queue, but however q->q.qlen must be valid.
81 
82    ---enqueue
83 
84    enqueue returns 0 if the packet was enqueued successfully.
85    If a packet (this one or another one) was dropped, it returns
86    a non-zero error code.
87    NET_XMIT_DROP 	- this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91 
92    Auxiliary routines:
93 
94    ---peek
95 
96    like dequeue but without removing a packet from the queue
97 
98    ---reset
99 
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102 
103    ---init
104 
105    initializes newly created qdisc.
106 
107    ---destroy
108 
109    destroys resources allocated by init and during lifetime of qdisc.
110 
111    ---change
112 
113    changes qdisc parameters.
114  */
115 
116 /* Protects the list of registered qdisc ops (qdisc_base); this file also
 * holds it while reading or replacing default_qdisc_ops.
 * It is a pure SMP lock.
 */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118 
119 
120 /************************************************
121  *	Queueing disciplines manipulation.	*
122  ************************************************/
123 
124 
125 /* Head of the singly linked list of all registered queueing disciplines,
 * chained through Qdisc_ops::next. Walked and modified with qdisc_mod_lock
 * held.
 */
126 
127 static struct Qdisc_ops *qdisc_base;
128 
129 /* Register/unregister queueing discipline */
130 
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133 	struct Qdisc_ops *q, **qp;
134 	int rc = -EEXIST;
135 
136 	write_lock(&qdisc_mod_lock);
137 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138 		if (!strcmp(qops->id, q->id))
139 			goto out;
140 
141 	if (qops->enqueue == NULL)
142 		qops->enqueue = noop_qdisc_ops.enqueue;
143 	if (qops->peek == NULL) {
144 		if (qops->dequeue == NULL)
145 			qops->peek = noop_qdisc_ops.peek;
146 		else
147 			goto out_einval;
148 	}
149 	if (qops->dequeue == NULL)
150 		qops->dequeue = noop_qdisc_ops.dequeue;
151 
152 	if (qops->cl_ops) {
153 		const struct Qdisc_class_ops *cops = qops->cl_ops;
154 
155 		if (!(cops->find && cops->walk && cops->leaf))
156 			goto out_einval;
157 
158 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159 			goto out_einval;
160 	}
161 
162 	qops->next = NULL;
163 	*qp = qops;
164 	rc = 0;
165 out:
166 	write_unlock(&qdisc_mod_lock);
167 	return rc;
168 
169 out_einval:
170 	rc = -EINVAL;
171 	goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174 
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177 	struct Qdisc_ops *q, **qp;
178 	int err = -ENOENT;
179 
180 	write_lock(&qdisc_mod_lock);
181 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182 		if (q == qops)
183 			break;
184 	if (q) {
185 		*qp = q->next;
186 		q->next = NULL;
187 		err = 0;
188 	}
189 	write_unlock(&qdisc_mod_lock);
190 
191 	WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194 
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198 	read_lock(&qdisc_mod_lock);
199 	strscpy(name, default_qdisc_ops->id, len);
200 	read_unlock(&qdisc_mod_lock);
201 }
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205 	struct Qdisc_ops *q = NULL;
206 
207 	for (q = qdisc_base; q; q = q->next) {
208 		if (!strcmp(name, q->id)) {
209 			if (!try_module_get(q->owner))
210 				q = NULL;
211 			break;
212 		}
213 	}
214 
215 	return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221 	const struct Qdisc_ops *ops;
222 
223 	if (!capable(CAP_NET_ADMIN))
224 		return -EPERM;
225 
226 	write_lock(&qdisc_mod_lock);
227 	ops = qdisc_lookup_default(name);
228 	if (!ops) {
229 		/* Not found, drop lock and try to load module */
230 		write_unlock(&qdisc_mod_lock);
231 		request_module("sch_%s", name);
232 		write_lock(&qdisc_mod_lock);
233 
234 		ops = qdisc_lookup_default(name);
235 	}
236 
237 	if (ops) {
238 		/* Set new default */
239 		module_put(default_qdisc_ops->owner);
240 		default_qdisc_ops = ops;
241 	}
242 	write_unlock(&qdisc_mod_lock);
243 
244 	return ops ? 0 : -ENOENT;
245 }
246 
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc named in the kernel config. */
static int __init sch_default_qdisc(void)
{
	const char *configured = CONFIG_DEFAULT_NET_SCH;

	return qdisc_set_default(configured);
}
late_initcall(sch_default_qdisc);
#endif
255 
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263 	struct Qdisc *q;
264 
265 	if (!qdisc_dev(root))
266 		return (root->handle == handle ? root : NULL);
267 
268 	if (!(root->flags & TCQ_F_BUILTIN) &&
269 	    root->handle == handle)
270 		return root;
271 
272 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273 				   lockdep_rtnl_is_held()) {
274 		if (q->handle == handle)
275 			return q;
276 	}
277 	return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 		ASSERT_RTNL();
284 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 		if (invisible)
286 			q->flags |= TCQ_F_INVISIBLE;
287 	}
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 		ASSERT_RTNL();
295 		hash_del_rcu(&q->hash);
296 	}
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 	struct Qdisc *q;
303 
304 	if (!handle)
305 		return NULL;
306 	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307 	if (q)
308 		goto out;
309 
310 	if (dev_ingress_queue(dev))
311 		q = qdisc_match_from_root(
312 			dev_ingress_queue(dev)->qdisc_sleeping,
313 			handle);
314 out:
315 	return q;
316 }
317 
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320 	struct netdev_queue *nq;
321 	struct Qdisc *q;
322 
323 	if (!handle)
324 		return NULL;
325 	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326 	if (q)
327 		goto out;
328 
329 	nq = dev_ingress_queue_rcu(dev);
330 	if (nq)
331 		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
332 out:
333 	return q;
334 }
335 
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338 	unsigned long cl;
339 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340 
341 	if (cops == NULL)
342 		return NULL;
343 	cl = cops->find(p, classid);
344 
345 	if (cl == 0)
346 		return NULL;
347 	return cops->leaf(p, cl);
348 }
349 
350 /* Find queueing discipline by name */
351 
352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
353 {
354 	struct Qdisc_ops *q = NULL;
355 
356 	if (kind) {
357 		read_lock(&qdisc_mod_lock);
358 		for (q = qdisc_base; q; q = q->next) {
359 			if (nla_strcmp(kind, q->id) == 0) {
360 				if (!try_module_get(q->owner))
361 					q = NULL;
362 				break;
363 			}
364 		}
365 		read_unlock(&qdisc_mod_lock);
366 	}
367 	return q;
368 }
369 
370 /* The linklayer setting were not transferred from iproute2, in older
371  * versions, and the rate tables lookup systems have been dropped in
372  * the kernel. To keep backward compatible with older iproute2 tc
373  * utils, we detect the linklayer setting by detecting if the rate
374  * table were modified.
375  *
376  * For linklayer ATM table entries, the rate table will be aligned to
377  * 48 bytes, thus some table entries will contain the same value.  The
378  * mpu (min packet unit) is also encoded into the old rate table, thus
379  * starting from the mpu, we find low and high table entries for
380  * mapping this cell.  If these entries contain the same value, when
381  * the rate tables have been modified for linklayer ATM.
382  *
383  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
384  * and then roundup to the next cell, calc the table entry one below,
385  * and compare.
386  */
387 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
388 {
389 	int low       = roundup(r->mpu, 48);
390 	int high      = roundup(low+1, 48);
391 	int cell_low  = low >> r->cell_log;
392 	int cell_high = (high >> r->cell_log) - 1;
393 
394 	/* rtab is too inaccurate at rates > 100Mbit/s */
395 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
396 		pr_debug("TC linklayer: Giving up ATM detection\n");
397 		return TC_LINKLAYER_ETHERNET;
398 	}
399 
400 	if ((cell_high > cell_low) && (cell_high < 256)
401 	    && (rtab[cell_low] == rtab[cell_high])) {
402 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
403 			 cell_low, cell_high, rtab[cell_high]);
404 		return TC_LINKLAYER_ATM;
405 	}
406 	return TC_LINKLAYER_ETHERNET;
407 }
408 
409 static struct qdisc_rate_table *qdisc_rtab_list;
410 
411 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
412 					struct nlattr *tab,
413 					struct netlink_ext_ack *extack)
414 {
415 	struct qdisc_rate_table *rtab;
416 
417 	if (tab == NULL || r->rate == 0 ||
418 	    r->cell_log == 0 || r->cell_log >= 32 ||
419 	    nla_len(tab) != TC_RTAB_SIZE) {
420 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
421 		return NULL;
422 	}
423 
424 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
427 			rtab->refcnt++;
428 			return rtab;
429 		}
430 	}
431 
432 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
433 	if (rtab) {
434 		rtab->rate = *r;
435 		rtab->refcnt = 1;
436 		memcpy(rtab->data, nla_data(tab), 1024);
437 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
438 			r->linklayer = __detect_linklayer(r, rtab->data);
439 		rtab->next = qdisc_rtab_list;
440 		qdisc_rtab_list = rtab;
441 	} else {
442 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
443 	}
444 	return rtab;
445 }
446 EXPORT_SYMBOL(qdisc_get_rtab);
447 
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450 	struct qdisc_rate_table *rtab, **rtabp;
451 
452 	if (!tab || --tab->refcnt)
453 		return;
454 
455 	for (rtabp = &qdisc_rtab_list;
456 	     (rtab = *rtabp) != NULL;
457 	     rtabp = &rtab->next) {
458 		if (rtab == tab) {
459 			*rtabp = rtab->next;
460 			kfree(rtab);
461 			return;
462 		}
463 	}
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466 
/* All size tables currently in use. Entries are shared and reference
 * counted by qdisc_get_stab() / qdisc_put_stab().
 */
467 static LIST_HEAD(qdisc_stab_list);
468 
/* Policy used by qdisc_get_stab() to parse the nested size table
 * attributes: TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA the
 * raw table contents.
 */
469 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
470 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
471 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
472 };
473 
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475 					       struct netlink_ext_ack *extack)
476 {
477 	struct nlattr *tb[TCA_STAB_MAX + 1];
478 	struct qdisc_size_table *stab;
479 	struct tc_sizespec *s;
480 	unsigned int tsize = 0;
481 	u16 *tab = NULL;
482 	int err;
483 
484 	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
485 					  extack);
486 	if (err < 0)
487 		return ERR_PTR(err);
488 	if (!tb[TCA_STAB_BASE]) {
489 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
490 		return ERR_PTR(-EINVAL);
491 	}
492 
493 	s = nla_data(tb[TCA_STAB_BASE]);
494 
495 	if (s->tsize > 0) {
496 		if (!tb[TCA_STAB_DATA]) {
497 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
498 			return ERR_PTR(-EINVAL);
499 		}
500 		tab = nla_data(tb[TCA_STAB_DATA]);
501 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
502 	}
503 
504 	if (tsize != s->tsize || (!tab && tsize > 0)) {
505 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
506 		return ERR_PTR(-EINVAL);
507 	}
508 
509 	list_for_each_entry(stab, &qdisc_stab_list, list) {
510 		if (memcmp(&stab->szopts, s, sizeof(*s)))
511 			continue;
512 		if (tsize > 0 &&
513 		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
514 			continue;
515 		stab->refcnt++;
516 		return stab;
517 	}
518 
519 	if (s->size_log > STAB_SIZE_LOG_MAX ||
520 	    s->cell_log > STAB_SIZE_LOG_MAX) {
521 		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522 		return ERR_PTR(-EINVAL);
523 	}
524 
525 	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
526 	if (!stab)
527 		return ERR_PTR(-ENOMEM);
528 
529 	stab->refcnt = 1;
530 	stab->szopts = *s;
531 	if (tsize > 0)
532 		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
533 
534 	list_add_tail(&stab->list, &qdisc_stab_list);
535 
536 	return stab;
537 }
538 
539 void qdisc_put_stab(struct qdisc_size_table *tab)
540 {
541 	if (!tab)
542 		return;
543 
544 	if (--tab->refcnt == 0) {
545 		list_del(&tab->list);
546 		kfree_rcu(tab, rcu);
547 	}
548 }
549 EXPORT_SYMBOL(qdisc_put_stab);
550 
551 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
552 {
553 	struct nlattr *nest;
554 
555 	nest = nla_nest_start_noflag(skb, TCA_STAB);
556 	if (nest == NULL)
557 		goto nla_put_failure;
558 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
559 		goto nla_put_failure;
560 	nla_nest_end(skb, nest);
561 
562 	return skb->len;
563 
564 nla_put_failure:
565 	return -1;
566 }
567 
568 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
569 			       const struct qdisc_size_table *stab)
570 {
571 	int pkt_len, slot;
572 
573 	pkt_len = skb->len + stab->szopts.overhead;
574 	if (unlikely(!stab->szopts.tsize))
575 		goto out;
576 
577 	slot = pkt_len + stab->szopts.cell_align;
578 	if (unlikely(slot < 0))
579 		slot = 0;
580 
581 	slot >>= stab->szopts.cell_log;
582 	if (likely(slot < stab->szopts.tsize))
583 		pkt_len = stab->data[slot];
584 	else
585 		pkt_len = stab->data[stab->szopts.tsize - 1] *
586 				(slot / stab->szopts.tsize) +
587 				stab->data[slot % stab->szopts.tsize];
588 
589 	pkt_len <<= stab->szopts.size_log;
590 out:
591 	if (unlikely(pkt_len < 1))
592 		pkt_len = 1;
593 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
594 }
595 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
596 
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601 			txt, qdisc->ops->id, qdisc->handle >> 16);
602 		qdisc->flags |= TCQ_F_WARN_NONWC;
603 	}
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606 
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610 						 timer);
611 
612 	rcu_read_lock();
613 	__netif_schedule(qdisc_root(wd->qdisc));
614 	rcu_read_unlock();
615 
616 	return HRTIMER_NORESTART;
617 }
618 
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620 				 clockid_t clockid)
621 {
622 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623 	wd->timer.function = qdisc_watchdog;
624 	wd->qdisc = qdisc;
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627 
628 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
629 {
630 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
631 }
632 EXPORT_SYMBOL(qdisc_watchdog_init);
633 
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
635 				      u64 delta_ns)
636 {
637 	if (test_bit(__QDISC_STATE_DEACTIVATED,
638 		     &qdisc_root_sleeping(wd->qdisc)->state))
639 		return;
640 
641 	if (hrtimer_is_queued(&wd->timer)) {
642 		/* If timer is already set in [expires, expires + delta_ns],
643 		 * do not reprogram it.
644 		 */
645 		if (wd->last_expires - expires <= delta_ns)
646 			return;
647 	}
648 
649 	wd->last_expires = expires;
650 	hrtimer_start_range_ns(&wd->timer,
651 			       ns_to_ktime(expires),
652 			       delta_ns,
653 			       HRTIMER_MODE_ABS_PINNED);
654 }
655 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
656 
657 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
658 {
659 	hrtimer_cancel(&wd->timer);
660 }
661 EXPORT_SYMBOL(qdisc_watchdog_cancel);
662 
663 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
664 {
665 	struct hlist_head *h;
666 	unsigned int i;
667 
668 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
669 
670 	if (h != NULL) {
671 		for (i = 0; i < n; i++)
672 			INIT_HLIST_HEAD(&h[i]);
673 	}
674 	return h;
675 }
676 
677 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
678 {
679 	struct Qdisc_class_common *cl;
680 	struct hlist_node *next;
681 	struct hlist_head *nhash, *ohash;
682 	unsigned int nsize, nmask, osize;
683 	unsigned int i, h;
684 
685 	/* Rehash when load factor exceeds 0.75 */
686 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
687 		return;
688 	nsize = clhash->hashsize * 2;
689 	nmask = nsize - 1;
690 	nhash = qdisc_class_hash_alloc(nsize);
691 	if (nhash == NULL)
692 		return;
693 
694 	ohash = clhash->hash;
695 	osize = clhash->hashsize;
696 
697 	sch_tree_lock(sch);
698 	for (i = 0; i < osize; i++) {
699 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
700 			h = qdisc_class_hash(cl->classid, nmask);
701 			hlist_add_head(&cl->hnode, &nhash[h]);
702 		}
703 	}
704 	clhash->hash     = nhash;
705 	clhash->hashsize = nsize;
706 	clhash->hashmask = nmask;
707 	sch_tree_unlock(sch);
708 
709 	kvfree(ohash);
710 }
711 EXPORT_SYMBOL(qdisc_class_hash_grow);
712 
713 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
714 {
715 	unsigned int size = 4;
716 
717 	clhash->hash = qdisc_class_hash_alloc(size);
718 	if (!clhash->hash)
719 		return -ENOMEM;
720 	clhash->hashsize  = size;
721 	clhash->hashmask  = size - 1;
722 	clhash->hashelems = 0;
723 	return 0;
724 }
725 EXPORT_SYMBOL(qdisc_class_hash_init);
726 
727 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
728 {
729 	kvfree(clhash->hash);
730 }
731 EXPORT_SYMBOL(qdisc_class_hash_destroy);
732 
733 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
734 			     struct Qdisc_class_common *cl)
735 {
736 	unsigned int h;
737 
738 	INIT_HLIST_NODE(&cl->hnode);
739 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
740 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
741 	clhash->hashelems++;
742 }
743 EXPORT_SYMBOL(qdisc_class_hash_insert);
744 
745 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
746 			     struct Qdisc_class_common *cl)
747 {
748 	hlist_del(&cl->hnode);
749 	clhash->hashelems--;
750 }
751 EXPORT_SYMBOL(qdisc_class_hash_remove);
752 
753 /* Allocate an unique handle from space managed by kernel
754  * Possible range is [8000-FFFF]:0000 (0x8000 values)
755  */
756 static u32 qdisc_alloc_handle(struct net_device *dev)
757 {
758 	int i = 0x8000;
759 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
760 
761 	do {
762 		autohandle += TC_H_MAKE(0x10000U, 0);
763 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
764 			autohandle = TC_H_MAKE(0x80000000U, 0);
765 		if (!qdisc_lookup(dev, autohandle))
766 			return autohandle;
767 		cond_resched();
768 	} while	(--i > 0);
769 
770 	return 0;
771 }
772 
/* Propagate a reduction of @n packets / @len bytes of backlog from @sch to
 * all of its ancestors.
 *
 * Starting at @sch, each parent is looked up by the major part of the
 * child's parent id; the parent's q.qlen is reduced by @n, its
 * qstats.backlog by @len, and max(@n, 0) is added to its drop counter.
 * @sch's own counters are not modified here, only those of its ancestors.
 *
 * If the child is empty at that point, the parent's qlen_notify class op
 * (if implemented) is called for the class the child is attached to.
 *
 * The walk ends when a qdisc has no parent, its parent is the ingress
 * handle, it is flagged TCQ_F_NOPARENT, or the parent cannot be found.
 */
773 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
774 {
	/* Sampled once from the qdisc the walk starts at. */
775 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
776 	const struct Qdisc_class_ops *cops;
777 	unsigned long cl;
778 	u32 parentid;
779 	bool notify;
780 	int drops;
781 
782 	if (n == 0 && len == 0)
783 		return;
	/* A negative @n is not counted as drops. */
784 	drops = max_t(int, n, 0);
785 	rcu_read_lock();
786 	while ((parentid = sch->parent)) {
787 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
788 			break;
789 
790 		if (sch->flags & TCQ_F_NOPARENT)
791 			break;
792 		/* Notify parent qdisc only if child qdisc becomes empty.
793 		 *
794 		 * If child was empty even before update then backlog
795 		 * counter is screwed and we skip notification because
796 		 * parent class is already passive.
797 		 *
798 		 * If the original child was offloaded then it is allowed
799 		 * to be seen as empty, so the parent is notified anyway.
800 		 */
801 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
802 						       !qdisc_is_offloaded);
803 		/* TODO: perform the search on a per txq basis */
		/* From here on sch refers to the parent qdisc. */
804 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
805 		if (sch == NULL) {
806 			WARN_ON_ONCE(parentid != TC_H_ROOT);
807 			break;
808 		}
		/* NOTE(review): cops is dereferenced without a NULL check;
		 * presumably a qdisc that has children always has cl_ops --
		 * confirm.
		 */
809 		cops = sch->ops->cl_ops;
810 		if (notify && cops->qlen_notify) {
811 			cl = cops->find(sch, parentid);
812 			cops->qlen_notify(sch, cl);
813 		}
814 		sch->q.qlen -= n;
815 		sch->qstats.backlog -= len;
816 		__qdisc_qstats_drop(sch, drops);
817 	}
818 	rcu_read_unlock();
819 }
820 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
821 
822 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
823 			      void *type_data)
824 {
825 	struct net_device *dev = qdisc_dev(sch);
826 	int err;
827 
828 	sch->flags &= ~TCQ_F_OFFLOADED;
829 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
830 		return 0;
831 
832 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
833 	if (err == -EOPNOTSUPP)
834 		return 0;
835 
836 	if (!err)
837 		sch->flags |= TCQ_F_OFFLOADED;
838 
839 	return err;
840 }
841 EXPORT_SYMBOL(qdisc_offload_dump_helper);
842 
843 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
844 				struct Qdisc *new, struct Qdisc *old,
845 				enum tc_setup_type type, void *type_data,
846 				struct netlink_ext_ack *extack)
847 {
848 	bool any_qdisc_is_offloaded;
849 	int err;
850 
851 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
852 		return;
853 
854 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
855 
856 	/* Don't report error if the graft is part of destroy operation. */
857 	if (!err || !new || new == &noop_qdisc)
858 		return;
859 
860 	/* Don't report error if the parent, the old child and the new
861 	 * one are not offloaded.
862 	 */
863 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
864 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
865 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
866 
867 	if (any_qdisc_is_offloaded)
868 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
869 }
870 EXPORT_SYMBOL(qdisc_offload_graft_helper);
871 
872 void qdisc_offload_query_caps(struct net_device *dev,
873 			      enum tc_setup_type type,
874 			      void *caps, size_t caps_len)
875 {
876 	const struct net_device_ops *ops = dev->netdev_ops;
877 	struct tc_query_caps_base base = {
878 		.type = type,
879 		.caps = caps,
880 	};
881 
882 	memset(caps, 0, caps_len);
883 
884 	if (ops->ndo_setup_tc)
885 		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
886 }
887 EXPORT_SYMBOL(qdisc_offload_query_caps);
888 
889 static void qdisc_offload_graft_root(struct net_device *dev,
890 				     struct Qdisc *new, struct Qdisc *old,
891 				     struct netlink_ext_ack *extack)
892 {
893 	struct tc_root_qopt_offload graft_offload = {
894 		.command	= TC_ROOT_GRAFT,
895 		.handle		= new ? new->handle : 0,
896 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
897 				  (old && old->flags & TCQ_F_INGRESS),
898 	};
899 
900 	qdisc_offload_graft_helper(dev, NULL, new, old,
901 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
902 }
903 
904 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
905 			 u32 portid, u32 seq, u16 flags, int event)
906 {
907 	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
908 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
909 	struct tcmsg *tcm;
910 	struct nlmsghdr  *nlh;
911 	unsigned char *b = skb_tail_pointer(skb);
912 	struct gnet_dump d;
913 	struct qdisc_size_table *stab;
914 	u32 block_index;
915 	__u32 qlen;
916 
917 	cond_resched();
918 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
919 	if (!nlh)
920 		goto out_nlmsg_trim;
921 	tcm = nlmsg_data(nlh);
922 	tcm->tcm_family = AF_UNSPEC;
923 	tcm->tcm__pad1 = 0;
924 	tcm->tcm__pad2 = 0;
925 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
926 	tcm->tcm_parent = clid;
927 	tcm->tcm_handle = q->handle;
928 	tcm->tcm_info = refcount_read(&q->refcnt);
929 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
930 		goto nla_put_failure;
931 	if (q->ops->ingress_block_get) {
932 		block_index = q->ops->ingress_block_get(q);
933 		if (block_index &&
934 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
935 			goto nla_put_failure;
936 	}
937 	if (q->ops->egress_block_get) {
938 		block_index = q->ops->egress_block_get(q);
939 		if (block_index &&
940 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
941 			goto nla_put_failure;
942 	}
943 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
944 		goto nla_put_failure;
945 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
946 		goto nla_put_failure;
947 	qlen = qdisc_qlen_sum(q);
948 
949 	stab = rtnl_dereference(q->stab);
950 	if (stab && qdisc_dump_stab(skb, stab) < 0)
951 		goto nla_put_failure;
952 
953 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
954 					 NULL, &d, TCA_PAD) < 0)
955 		goto nla_put_failure;
956 
957 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
958 		goto nla_put_failure;
959 
960 	if (qdisc_is_percpu_stats(q)) {
961 		cpu_bstats = q->cpu_bstats;
962 		cpu_qstats = q->cpu_qstats;
963 	}
964 
965 	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
966 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
967 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
968 		goto nla_put_failure;
969 
970 	if (gnet_stats_finish_copy(&d) < 0)
971 		goto nla_put_failure;
972 
973 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
974 	return skb->len;
975 
976 out_nlmsg_trim:
977 nla_put_failure:
978 	nlmsg_trim(skb, b);
979 	return -1;
980 }
981 
982 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
983 {
984 	if (q->flags & TCQ_F_BUILTIN)
985 		return true;
986 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
987 		return true;
988 
989 	return false;
990 }
991 
992 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
993 			struct nlmsghdr *n, u32 clid,
994 			struct Qdisc *old, struct Qdisc *new)
995 {
996 	struct sk_buff *skb;
997 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
998 
999 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1000 	if (!skb)
1001 		return -ENOBUFS;
1002 
1003 	if (old && !tc_qdisc_dump_ignore(old, false)) {
1004 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1005 				  0, RTM_DELQDISC) < 0)
1006 			goto err_out;
1007 	}
1008 	if (new && !tc_qdisc_dump_ignore(new, false)) {
1009 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1010 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1011 			goto err_out;
1012 	}
1013 
1014 	if (skb->len)
1015 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1016 				      n->nlmsg_flags & NLM_F_ECHO);
1017 
1018 err_out:
1019 	kfree_skb(skb);
1020 	return -EINVAL;
1021 }
1022 
1023 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1024 			       struct nlmsghdr *n, u32 clid,
1025 			       struct Qdisc *old, struct Qdisc *new)
1026 {
1027 	if (new || old)
1028 		qdisc_notify(net, skb, n, clid, old, new);
1029 
1030 	if (old)
1031 		qdisc_put(old);
1032 }
1033 
1034 static void qdisc_clear_nolock(struct Qdisc *sch)
1035 {
1036 	sch->flags &= ~TCQ_F_NOLOCK;
1037 	if (!(sch->flags & TCQ_F_CPUSTATS))
1038 		return;
1039 
1040 	free_percpu(sch->cpu_bstats);
1041 	free_percpu(sch->cpu_qstats);
1042 	sch->cpu_bstats = NULL;
1043 	sch->cpu_qstats = NULL;
1044 	sch->flags &= ~TCQ_F_CPUSTATS;
1045 }
1046 
1047 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * With parent == NULL the graft happens at device level: either on the
 * ingress queue (if "old" or "new" is an ingress qdisc) or on all TX queues
 * plus dev->qdisc. Otherwise the graft is delegated to the parent's
 * graft class op.
 *
 * Returns 0 on success, or:
 *   -ENOENT      the device has no ingress queue, or the class was not found
 *   -EOPNOTSUPP  the parent has no class ops or no graft op
 *   -EINVAL      "new" is a noqueue qdisc and was to be attached to a class
 *   or the error returned by the parent's graft op.
 */
1055 
1056 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1057 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1058 		       struct Qdisc *new, struct Qdisc *old,
1059 		       struct netlink_ext_ack *extack)
1060 {
1061 	struct Qdisc *q = old;
1062 	struct net *net = dev_net(dev);
1063 
1064 	if (parent == NULL) {
1065 		unsigned int i, num_q, ingress;
1066 
1067 		ingress = 0;
1068 		num_q = dev->num_tx_queues;
		/* An ingress graft only touches the single ingress queue. */
1069 		if ((q && q->flags & TCQ_F_INGRESS) ||
1070 		    (new && new->flags & TCQ_F_INGRESS)) {
1071 			num_q = 1;
1072 			ingress = 1;
1073 			if (!dev_ingress_queue(dev)) {
1074 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1075 				return -ENOENT;
1076 			}
1077 		}
1078 
		/* A device that is up is deactivated for the duration of the
		 * graft and re-activated at the end of this branch.
		 */
1079 		if (dev->flags & IFF_UP)
1080 			dev_deactivate(dev);
1081 
1082 		qdisc_offload_graft_root(dev, new, old, extack);
1083 
		/* A non-ingress qdisc with an attach op is not grafted onto
		 * the TX queues here; its attach op is called further down.
		 */
1084 		if (new && new->ops->attach && !ingress)
1085 			goto skip;
1086 
1087 		for (i = 0; i < num_q; i++) {
1088 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1089 
1090 			if (!ingress)
1091 				dev_queue = netdev_get_tx_queue(dev, i);
1092 
			/* NOTE(review): dev_graft_qdisc() presumably returns
			 * the qdisc previously attached to the queue -- confirm.
			 */
1093 			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference on "new" for every queue after
			 * the first.
			 */
1094 			if (new && i > 0)
1095 				qdisc_refcount_inc(new);
1096 
1097 			if (!ingress)
1098 				qdisc_put(old);
1099 		}
1100 
1101 skip:
1102 		if (!ingress) {
			/* Replace the device's root qdisc pointer; noop_qdisc
			 * is installed when there is no new qdisc.
			 */
1103 			old = rtnl_dereference(dev->qdisc);
1104 			if (new && !new->ops->attach)
1105 				qdisc_refcount_inc(new);
1106 			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1107 
1108 			notify_and_destroy(net, skb, n, classid, old, new);
1109 
			/* The attach op runs only after the old root has been
			 * notified about and put.
			 */
1110 			if (new && new->ops->attach)
1111 				new->ops->attach(new);
1112 		} else {
1113 			notify_and_destroy(net, skb, n, classid, old, new);
1114 		}
1115 
1116 		if (dev->flags & IFF_UP)
1117 			dev_activate(dev);
1118 	} else {
1119 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1120 		unsigned long cl;
1121 		int err;
1122 
1123 		/* Only support running class lockless if parent is lockless */
		/* Note that this runs before any of the checks below, so
		 * "new" may lose TCQ_F_NOLOCK even if the graft then fails.
		 */
1124 		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1125 			qdisc_clear_nolock(new);
1126 
1127 		if (!cops || !cops->graft)
1128 			return -EOPNOTSUPP;
1129 
1130 		cl = cops->find(parent, classid);
1131 		if (!cl) {
1132 			NL_SET_ERR_MSG(extack, "Specified class not found");
1133 			return -ENOENT;
1134 		}
1135 
1136 		if (new && new->ops == &noqueue_qdisc_ops) {
1137 			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1138 			return -EINVAL;
1139 		}
1140 
		/* The graft op hands back the replaced qdisc through &old. */
1141 		err = cops->graft(parent, cl, new, &old, extack);
1142 		if (err)
1143 			return err;
1144 		notify_and_destroy(net, skb, n, classid, old, new);
1145 	}
1146 	return 0;
1147 }
1148 
1149 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1150 				   struct netlink_ext_ack *extack)
1151 {
1152 	u32 block_index;
1153 
1154 	if (tca[TCA_INGRESS_BLOCK]) {
1155 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1156 
1157 		if (!block_index) {
1158 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1159 			return -EINVAL;
1160 		}
1161 		if (!sch->ops->ingress_block_set) {
1162 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1163 			return -EOPNOTSUPP;
1164 		}
1165 		sch->ops->ingress_block_set(sch, block_index);
1166 	}
1167 	if (tca[TCA_EGRESS_BLOCK]) {
1168 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1169 
1170 		if (!block_index) {
1171 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1172 			return -EINVAL;
1173 		}
1174 		if (!sch->ops->egress_block_set) {
1175 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1176 			return -EOPNOTSUPP;
1177 		}
1178 		sch->ops->egress_block_set(sch, block_index);
1179 	}
1180 	return 0;
1181 }
1182 
/*
 * Allocate and initialize new qdisc.
 *
 * Parameters are passed via opt.
 *
 * @dev:       device the qdisc is created for
 * @dev_queue: queue handed to qdisc_alloc()
 * @parent:    stored in sch->parent
 * @handle:    requested handle; TC_H_INGRESS marks the qdisc as ingress,
 *             0 asks for a handle from qdisc_alloc_handle()
 * @tca:       parsed netlink attributes (TCA_KIND, TCA_OPTIONS, TCA_STAB,
 *             TCA_RATE and the block indexes are consumed here)
 * @errp:      receives the negative errno when NULL is returned
 * @extack:    extended ack for error messages
 *
 * Returns the new qdisc, already added to the qdisc hash, or NULL with
 * *errp set.  *errp == -EAGAIN means a module was loaded with the RTNL
 * lock dropped and the caller has to replay the request.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

	/* Unwind ladder: each label undoes one more step than the one
	 * below it.  err_out4 lives after the ladder and jumps back into
	 * it at err_out3.
	 */
err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	/* The qdisc was allocated but is being abandoned */
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	/* Drop the module reference held on "ops" */
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	/* Failure after a successful ->init(): release the size table
	 * (if one was installed) before destroying the qdisc.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1338 
1339 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1340 			struct netlink_ext_ack *extack)
1341 {
1342 	struct qdisc_size_table *ostab, *stab = NULL;
1343 	int err = 0;
1344 
1345 	if (tca[TCA_OPTIONS]) {
1346 		if (!sch->ops->change) {
1347 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1348 			return -EINVAL;
1349 		}
1350 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1351 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1352 			return -EOPNOTSUPP;
1353 		}
1354 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1355 		if (err)
1356 			return err;
1357 	}
1358 
1359 	if (tca[TCA_STAB]) {
1360 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1361 		if (IS_ERR(stab))
1362 			return PTR_ERR(stab);
1363 	}
1364 
1365 	ostab = rtnl_dereference(sch->stab);
1366 	rcu_assign_pointer(sch->stab, stab);
1367 	qdisc_put_stab(ostab);
1368 
1369 	if (tca[TCA_RATE]) {
1370 		/* NB: ignores errors from replace_estimator
1371 		   because change can't be undone. */
1372 		if (sch->flags & TCQ_F_MQROOT)
1373 			goto out;
1374 		gen_replace_estimator(&sch->bstats,
1375 				      sch->cpu_bstats,
1376 				      &sch->rate_est,
1377 				      NULL,
1378 				      true,
1379 				      tca[TCA_RATE]);
1380 	}
1381 out:
1382 	return 0;
1383 }
1384 
/* Walker state used by check_loop()/check_loop_fn().
 *
 * "w" must stay the first member: check_loop_fn() casts the
 * struct qdisc_walker pointer it receives back to this type.
 */
struct check_loop_arg {
	struct qdisc_walker	w;	/* generic class walker state */
	struct Qdisc		*p;	/* qdisc that must not show up as a leaf */
	int			depth;	/* current nesting level of the search */
};

/* Forward declaration: check_loop() and check_loop_fn() call each other */
static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1393 
1394 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1395 {
1396 	struct check_loop_arg	arg;
1397 
1398 	if (q->ops->cl_ops == NULL)
1399 		return 0;
1400 
1401 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1402 	arg.w.fn = check_loop_fn;
1403 	arg.depth = depth;
1404 	arg.p = p;
1405 	q->ops->cl_ops->walk(q, &arg.w);
1406 	return arg.w.stop ? -ELOOP : 0;
1407 }
1408 
1409 static int
1410 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1411 {
1412 	struct Qdisc *leaf;
1413 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1414 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1415 
1416 	leaf = cops->leaf(q, cl);
1417 	if (leaf) {
1418 		if (leaf == arg->p || arg->depth > 7)
1419 			return -ELOOP;
1420 		return check_loop(leaf, arg->p, arg->depth + 1);
1421 	}
1422 	return 0;
1423 }
1424 
/* Netlink attribute policy for tc messages.
 *
 * Passed to nlmsg_parse_deprecated() by the qdisc and class handlers in
 * this file.  Not static, so it is visible outside this file as well.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1435 
1436 /*
1437  * Delete/get qdisc.
1438  */
1439 
1440 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1441 			struct netlink_ext_ack *extack)
1442 {
1443 	struct net *net = sock_net(skb->sk);
1444 	struct tcmsg *tcm = nlmsg_data(n);
1445 	struct nlattr *tca[TCA_MAX + 1];
1446 	struct net_device *dev;
1447 	u32 clid;
1448 	struct Qdisc *q = NULL;
1449 	struct Qdisc *p = NULL;
1450 	int err;
1451 
1452 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1453 				     rtm_tca_policy, extack);
1454 	if (err < 0)
1455 		return err;
1456 
1457 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1458 	if (!dev)
1459 		return -ENODEV;
1460 
1461 	clid = tcm->tcm_parent;
1462 	if (clid) {
1463 		if (clid != TC_H_ROOT) {
1464 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1465 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1466 				if (!p) {
1467 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1468 					return -ENOENT;
1469 				}
1470 				q = qdisc_leaf(p, clid);
1471 			} else if (dev_ingress_queue(dev)) {
1472 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1473 			}
1474 		} else {
1475 			q = rtnl_dereference(dev->qdisc);
1476 		}
1477 		if (!q) {
1478 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1479 			return -ENOENT;
1480 		}
1481 
1482 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1483 			NL_SET_ERR_MSG(extack, "Invalid handle");
1484 			return -EINVAL;
1485 		}
1486 	} else {
1487 		q = qdisc_lookup(dev, tcm->tcm_handle);
1488 		if (!q) {
1489 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1490 			return -ENOENT;
1491 		}
1492 	}
1493 
1494 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1495 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1496 		return -EINVAL;
1497 	}
1498 
1499 	if (n->nlmsg_type == RTM_DELQDISC) {
1500 		if (!clid) {
1501 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1502 			return -EINVAL;
1503 		}
1504 		if (q->handle == 0) {
1505 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1506 			return -ENOENT;
1507 		}
1508 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1509 		if (err != 0)
1510 			return err;
1511 	} else {
1512 		qdisc_notify(net, skb, n, clid, NULL, q);
1513 	}
1514 	return 0;
1515 }
1516 
1517 /*
1518  * Create/change qdisc.
1519  */
1520 
1521 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1522 			   struct netlink_ext_ack *extack)
1523 {
1524 	struct net *net = sock_net(skb->sk);
1525 	struct tcmsg *tcm;
1526 	struct nlattr *tca[TCA_MAX + 1];
1527 	struct net_device *dev;
1528 	u32 clid;
1529 	struct Qdisc *q, *p;
1530 	int err;
1531 
1532 replay:
1533 	/* Reinit, just in case something touches this. */
1534 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1535 				     rtm_tca_policy, extack);
1536 	if (err < 0)
1537 		return err;
1538 
1539 	tcm = nlmsg_data(n);
1540 	clid = tcm->tcm_parent;
1541 	q = p = NULL;
1542 
1543 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1544 	if (!dev)
1545 		return -ENODEV;
1546 
1547 
1548 	if (clid) {
1549 		if (clid != TC_H_ROOT) {
1550 			if (clid != TC_H_INGRESS) {
1551 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1552 				if (!p) {
1553 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1554 					return -ENOENT;
1555 				}
1556 				q = qdisc_leaf(p, clid);
1557 			} else if (dev_ingress_queue_create(dev)) {
1558 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1559 			}
1560 		} else {
1561 			q = rtnl_dereference(dev->qdisc);
1562 		}
1563 
1564 		/* It may be default qdisc, ignore it */
1565 		if (q && q->handle == 0)
1566 			q = NULL;
1567 
1568 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1569 			if (tcm->tcm_handle) {
1570 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1571 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1572 					return -EEXIST;
1573 				}
1574 				if (TC_H_MIN(tcm->tcm_handle)) {
1575 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1576 					return -EINVAL;
1577 				}
1578 				q = qdisc_lookup(dev, tcm->tcm_handle);
1579 				if (!q)
1580 					goto create_n_graft;
1581 				if (n->nlmsg_flags & NLM_F_EXCL) {
1582 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1583 					return -EEXIST;
1584 				}
1585 				if (tca[TCA_KIND] &&
1586 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1587 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1588 					return -EINVAL;
1589 				}
1590 				if (q == p ||
1591 				    (p && check_loop(q, p, 0))) {
1592 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1593 					return -ELOOP;
1594 				}
1595 				qdisc_refcount_inc(q);
1596 				goto graft;
1597 			} else {
1598 				if (!q)
1599 					goto create_n_graft;
1600 
1601 				/* This magic test requires explanation.
1602 				 *
1603 				 *   We know, that some child q is already
1604 				 *   attached to this parent and have choice:
1605 				 *   either to change it or to create/graft new one.
1606 				 *
1607 				 *   1. We are allowed to create/graft only
1608 				 *   if CREATE and REPLACE flags are set.
1609 				 *
1610 				 *   2. If EXCL is set, requestor wanted to say,
1611 				 *   that qdisc tcm_handle is not expected
1612 				 *   to exist, so that we choose create/graft too.
1613 				 *
1614 				 *   3. The last case is when no flags are set.
1615 				 *   Alas, it is sort of hole in API, we
1616 				 *   cannot decide what to do unambiguously.
1617 				 *   For now we select create/graft, if
1618 				 *   user gave KIND, which does not match existing.
1619 				 */
1620 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1621 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1622 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1623 				     (tca[TCA_KIND] &&
1624 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1625 					goto create_n_graft;
1626 			}
1627 		}
1628 	} else {
1629 		if (!tcm->tcm_handle) {
1630 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1631 			return -EINVAL;
1632 		}
1633 		q = qdisc_lookup(dev, tcm->tcm_handle);
1634 	}
1635 
1636 	/* Change qdisc parameters */
1637 	if (!q) {
1638 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1639 		return -ENOENT;
1640 	}
1641 	if (n->nlmsg_flags & NLM_F_EXCL) {
1642 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1643 		return -EEXIST;
1644 	}
1645 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1646 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1647 		return -EINVAL;
1648 	}
1649 	err = qdisc_change(q, tca, extack);
1650 	if (err == 0)
1651 		qdisc_notify(net, skb, n, clid, NULL, q);
1652 	return err;
1653 
1654 create_n_graft:
1655 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1656 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1657 		return -ENOENT;
1658 	}
1659 	if (clid == TC_H_INGRESS) {
1660 		if (dev_ingress_queue(dev)) {
1661 			q = qdisc_create(dev, dev_ingress_queue(dev),
1662 					 tcm->tcm_parent, tcm->tcm_parent,
1663 					 tca, &err, extack);
1664 		} else {
1665 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1666 			err = -ENOENT;
1667 		}
1668 	} else {
1669 		struct netdev_queue *dev_queue;
1670 
1671 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1672 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1673 		else if (p)
1674 			dev_queue = p->dev_queue;
1675 		else
1676 			dev_queue = netdev_get_tx_queue(dev, 0);
1677 
1678 		q = qdisc_create(dev, dev_queue,
1679 				 tcm->tcm_parent, tcm->tcm_handle,
1680 				 tca, &err, extack);
1681 	}
1682 	if (q == NULL) {
1683 		if (err == -EAGAIN)
1684 			goto replay;
1685 		return err;
1686 	}
1687 
1688 graft:
1689 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1690 	if (err) {
1691 		if (q)
1692 			qdisc_put(q);
1693 		return err;
1694 	}
1695 
1696 	return 0;
1697 }
1698 
1699 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1700 			      struct netlink_callback *cb,
1701 			      int *q_idx_p, int s_q_idx, bool recur,
1702 			      bool dump_invisible)
1703 {
1704 	int ret = 0, q_idx = *q_idx_p;
1705 	struct Qdisc *q;
1706 	int b;
1707 
1708 	if (!root)
1709 		return 0;
1710 
1711 	q = root;
1712 	if (q_idx < s_q_idx) {
1713 		q_idx++;
1714 	} else {
1715 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1716 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1717 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1718 				  RTM_NEWQDISC) <= 0)
1719 			goto done;
1720 		q_idx++;
1721 	}
1722 
1723 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1724 	 * itself has already been dumped.
1725 	 *
1726 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1727 	 * qdisc hashtable, we don't want to hit it again
1728 	 */
1729 	if (!qdisc_dev(root) || !recur)
1730 		goto out;
1731 
1732 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1733 		if (q_idx < s_q_idx) {
1734 			q_idx++;
1735 			continue;
1736 		}
1737 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1738 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1739 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1740 				  RTM_NEWQDISC) <= 0)
1741 			goto done;
1742 		q_idx++;
1743 	}
1744 
1745 out:
1746 	*q_idx_p = q_idx;
1747 	return ret;
1748 done:
1749 	ret = -1;
1750 	goto out;
1751 }
1752 
1753 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1754 {
1755 	struct net *net = sock_net(skb->sk);
1756 	int idx, q_idx;
1757 	int s_idx, s_q_idx;
1758 	struct net_device *dev;
1759 	const struct nlmsghdr *nlh = cb->nlh;
1760 	struct nlattr *tca[TCA_MAX + 1];
1761 	int err;
1762 
1763 	s_idx = cb->args[0];
1764 	s_q_idx = q_idx = cb->args[1];
1765 
1766 	idx = 0;
1767 	ASSERT_RTNL();
1768 
1769 	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1770 				     rtm_tca_policy, cb->extack);
1771 	if (err < 0)
1772 		return err;
1773 
1774 	for_each_netdev(net, dev) {
1775 		struct netdev_queue *dev_queue;
1776 
1777 		if (idx < s_idx)
1778 			goto cont;
1779 		if (idx > s_idx)
1780 			s_q_idx = 0;
1781 		q_idx = 0;
1782 
1783 		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1784 				       skb, cb, &q_idx, s_q_idx,
1785 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1786 			goto done;
1787 
1788 		dev_queue = dev_ingress_queue(dev);
1789 		if (dev_queue &&
1790 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1791 				       &q_idx, s_q_idx, false,
1792 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1793 			goto done;
1794 
1795 cont:
1796 		idx++;
1797 	}
1798 
1799 done:
1800 	cb->args[0] = idx;
1801 	cb->args[1] = q_idx;
1802 
1803 	return skb->len;
1804 }
1805 
1806 
1807 
1808 /************************************************
1809  *	Traffic classes manipulation.		*
1810  ************************************************/
1811 
1812 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1813 			  unsigned long cl,
1814 			  u32 portid, u32 seq, u16 flags, int event)
1815 {
1816 	struct tcmsg *tcm;
1817 	struct nlmsghdr  *nlh;
1818 	unsigned char *b = skb_tail_pointer(skb);
1819 	struct gnet_dump d;
1820 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1821 
1822 	cond_resched();
1823 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1824 	if (!nlh)
1825 		goto out_nlmsg_trim;
1826 	tcm = nlmsg_data(nlh);
1827 	tcm->tcm_family = AF_UNSPEC;
1828 	tcm->tcm__pad1 = 0;
1829 	tcm->tcm__pad2 = 0;
1830 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1831 	tcm->tcm_parent = q->handle;
1832 	tcm->tcm_handle = q->handle;
1833 	tcm->tcm_info = 0;
1834 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1835 		goto nla_put_failure;
1836 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1837 		goto nla_put_failure;
1838 
1839 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1840 					 NULL, &d, TCA_PAD) < 0)
1841 		goto nla_put_failure;
1842 
1843 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1844 		goto nla_put_failure;
1845 
1846 	if (gnet_stats_finish_copy(&d) < 0)
1847 		goto nla_put_failure;
1848 
1849 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1850 	return skb->len;
1851 
1852 out_nlmsg_trim:
1853 nla_put_failure:
1854 	nlmsg_trim(skb, b);
1855 	return -1;
1856 }
1857 
1858 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1859 			 struct nlmsghdr *n, struct Qdisc *q,
1860 			 unsigned long cl, int event)
1861 {
1862 	struct sk_buff *skb;
1863 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1864 
1865 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1866 	if (!skb)
1867 		return -ENOBUFS;
1868 
1869 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1870 		kfree_skb(skb);
1871 		return -EINVAL;
1872 	}
1873 
1874 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1875 			      n->nlmsg_flags & NLM_F_ECHO);
1876 }
1877 
1878 static int tclass_del_notify(struct net *net,
1879 			     const struct Qdisc_class_ops *cops,
1880 			     struct sk_buff *oskb, struct nlmsghdr *n,
1881 			     struct Qdisc *q, unsigned long cl,
1882 			     struct netlink_ext_ack *extack)
1883 {
1884 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1885 	struct sk_buff *skb;
1886 	int err = 0;
1887 
1888 	if (!cops->delete)
1889 		return -EOPNOTSUPP;
1890 
1891 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1892 	if (!skb)
1893 		return -ENOBUFS;
1894 
1895 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1896 			   RTM_DELTCLASS) < 0) {
1897 		kfree_skb(skb);
1898 		return -EINVAL;
1899 	}
1900 
1901 	err = cops->delete(q, cl, extack);
1902 	if (err) {
1903 		kfree_skb(skb);
1904 		return err;
1905 	}
1906 
1907 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1908 			     n->nlmsg_flags & NLM_F_ECHO);
1909 	return err;
1910 }
1911 
1912 #ifdef CONFIG_NET_CLS
1913 
/* Argument block for tcf_node_bind().
 *
 * "w" must stay the first member: tcf_node_bind() casts the
 * struct tcf_walker pointer it receives back to this type.
 */
struct tcf_bind_args {
	struct tcf_walker w;	/* generic filter walker state */
	unsigned long base;	/* passed to ->bind_class() as its last argument */
	unsigned long cl;	/* class to bind to; callers pass 0 to unbind */
	u32 classid;		/* class id handed to ->bind_class() */
};
1920 
1921 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1922 {
1923 	struct tcf_bind_args *a = (void *)arg;
1924 
1925 	if (n && tp->ops->bind_class) {
1926 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1927 
1928 		sch_tree_lock(q);
1929 		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1930 		sch_tree_unlock(q);
1931 	}
1932 	return 0;
1933 }
1934 
/* Argument block for tc_bind_class_walker().
 *
 * "w" must stay the first member: the walker callback casts the
 * struct qdisc_walker pointer it receives back to this type.
 */
struct tc_bind_class_args {
	struct qdisc_walker w;	/* generic class walker state */
	unsigned long new_cl;	/* class to bind filters to (0 to unbind) */
	u32 portid;		/* stored by tc_bind_tclass(); not read by the walker */
	u32 clid;		/* class id whose filters are rebound */
};
1941 
1942 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1943 				struct qdisc_walker *w)
1944 {
1945 	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1946 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1947 	struct tcf_block *block;
1948 	struct tcf_chain *chain;
1949 
1950 	block = cops->tcf_block(q, cl, NULL);
1951 	if (!block)
1952 		return 0;
1953 	for (chain = tcf_get_next_chain(block, NULL);
1954 	     chain;
1955 	     chain = tcf_get_next_chain(block, chain)) {
1956 		struct tcf_proto *tp;
1957 
1958 		for (tp = tcf_get_next_proto(chain, NULL);
1959 		     tp; tp = tcf_get_next_proto(chain, tp)) {
1960 			struct tcf_bind_args arg = {};
1961 
1962 			arg.w.fn = tcf_node_bind;
1963 			arg.classid = a->clid;
1964 			arg.base = cl;
1965 			arg.cl = a->new_cl;
1966 			tp->ops->walk(tp, &arg.w, true);
1967 		}
1968 	}
1969 
1970 	return 0;
1971 }
1972 
1973 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1974 			   unsigned long new_cl)
1975 {
1976 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1977 	struct tc_bind_class_args args = {};
1978 
1979 	if (!cops->tcf_block)
1980 		return;
1981 	args.portid = portid;
1982 	args.clid = clid;
1983 	args.new_cl = new_cl;
1984 	args.w.fn = tc_bind_class_walker;
1985 	q->ops->cl_ops->walk(q, &args.w);
1986 }
1987 
1988 #else
1989 
/* Without CONFIG_NET_CLS there are no filters to rebind: no-op stub. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1994 
1995 #endif
1996 
/* Handler for traffic class requests: create/change (RTM_NEWTCLASS),
 * delete (RTM_DELTCLASS) and get (RTM_GETTCLASS).
 *
 * The owning qdisc is derived from the major numbers of tcm_parent and
 * tcm_handle, falling back to the handle of the device's root qdisc when
 * neither supplies one.  Note that the local "portid" holds the parent
 * class id (tcm_parent), not a netlink port id.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* No such class: only a create request may continue */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the filters from this class (bind them to 0).
			 * NOTE(review): this runs even when the delete above
			 * returned an error - confirm that is intended.
			 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create a new class or change the existing one; ->change() reports
	 * the resulting class through new_cl.
	 */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		/* The notification result is deliberately not propagated */
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2129 
/* Walker state for dumping the classes of a qdisc.
 *
 * "w" must stay the first member: qdisc_class_dump() casts the
 * struct qdisc_walker pointer it receives back to this type.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* generic class walker state */
	struct sk_buff		*skb;	/* dump skb the class messages go into */
	struct netlink_callback	*cb;	/* dump callback context of the request */
};
2135 
2136 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2137 			    struct qdisc_walker *arg)
2138 {
2139 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2140 
2141 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2142 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2143 			      RTM_NEWTCLASS);
2144 }
2145 
2146 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2147 				struct tcmsg *tcm, struct netlink_callback *cb,
2148 				int *t_p, int s_t)
2149 {
2150 	struct qdisc_dump_args arg;
2151 
2152 	if (tc_qdisc_dump_ignore(q, false) ||
2153 	    *t_p < s_t || !q->ops->cl_ops ||
2154 	    (tcm->tcm_parent &&
2155 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2156 		(*t_p)++;
2157 		return 0;
2158 	}
2159 	if (*t_p > s_t)
2160 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2161 	arg.w.fn = qdisc_class_dump;
2162 	arg.skb = skb;
2163 	arg.cb = cb;
2164 	arg.w.stop  = 0;
2165 	arg.w.skip = cb->args[1];
2166 	arg.w.count = 0;
2167 	q->ops->cl_ops->walk(q, &arg.w);
2168 	cb->args[1] = arg.w.count;
2169 	if (arg.w.stop)
2170 		return -1;
2171 	(*t_p)++;
2172 	return 0;
2173 }
2174 
2175 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2176 			       struct tcmsg *tcm, struct netlink_callback *cb,
2177 			       int *t_p, int s_t, bool recur)
2178 {
2179 	struct Qdisc *q;
2180 	int b;
2181 
2182 	if (!root)
2183 		return 0;
2184 
2185 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2186 		return -1;
2187 
2188 	if (!qdisc_dev(root) || !recur)
2189 		return 0;
2190 
2191 	if (tcm->tcm_parent) {
2192 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2193 		if (q && q != root &&
2194 		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2195 			return -1;
2196 		return 0;
2197 	}
2198 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2199 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2200 			return -1;
2201 	}
2202 
2203 	return 0;
2204 }
2205 
2206 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2207 {
2208 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2209 	struct net *net = sock_net(skb->sk);
2210 	struct netdev_queue *dev_queue;
2211 	struct net_device *dev;
2212 	int t, s_t;
2213 
2214 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2215 		return 0;
2216 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2217 	if (!dev)
2218 		return 0;
2219 
2220 	s_t = cb->args[0];
2221 	t = 0;
2222 
2223 	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2224 				skb, tcm, cb, &t, s_t, true) < 0)
2225 		goto done;
2226 
2227 	dev_queue = dev_ingress_queue(dev);
2228 	if (dev_queue &&
2229 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2230 				&t, s_t, false) < 0)
2231 		goto done;
2232 
2233 done:
2234 	cb->args[0] = t;
2235 
2236 	dev_put(dev);
2237 	return skb->len;
2238 }
2239 
2240 #ifdef CONFIG_PROC_FS
2241 static int psched_show(struct seq_file *seq, void *v)
2242 {
2243 	seq_printf(seq, "%08x %08x %08x %08x\n",
2244 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2245 		   1000000,
2246 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2247 
2248 	return 0;
2249 }
2250 
2251 static int __net_init psched_net_init(struct net *net)
2252 {
2253 	struct proc_dir_entry *e;
2254 
2255 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2256 	if (e == NULL)
2257 		return -ENOMEM;
2258 
2259 	return 0;
2260 }
2261 
2262 static void __net_exit psched_net_exit(struct net *net)
2263 {
2264 	remove_proc_entry("psched", net->proc_net);
2265 }
2266 #else
2267 static int __net_init psched_net_init(struct net *net)
2268 {
2269 	return 0;
2270 }
2271 
2272 static void __net_exit psched_net_exit(struct net *net)
2273 {
2274 }
2275 #endif
2276 
2277 static struct pernet_operations psched_net_ops = {
2278 	.init = psched_net_init,
2279 	.exit = psched_net_exit,
2280 };
2281 
/*
 * Static key, initially false.
 * NOTE(review): presumably consumed by the helpers in <net/tc_wrapper.h>
 * and switched by tc_wrapper_init() - confirm against that header.
 */
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2283 
2284 static int __init pktsched_init(void)
2285 {
2286 	int err;
2287 
2288 	err = register_pernet_subsys(&psched_net_ops);
2289 	if (err) {
2290 		pr_err("pktsched_init: "
2291 		       "cannot initialize per netns operations\n");
2292 		return err;
2293 	}
2294 
2295 	register_qdisc(&pfifo_fast_ops);
2296 	register_qdisc(&pfifo_qdisc_ops);
2297 	register_qdisc(&bfifo_qdisc_ops);
2298 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2299 	register_qdisc(&mq_qdisc_ops);
2300 	register_qdisc(&noqueue_qdisc_ops);
2301 
2302 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2303 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2304 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2305 		      0);
2306 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2307 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2308 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2309 		      0);
2310 
2311 	tc_wrapper_init();
2312 
2313 	return 0;
2314 }
2315 
/* Run pktsched_init() at the subsys initcall level. */
subsys_initcall(pktsched_init);
2317