xref: /openbmc/linux/net/sched/sch_api.c (revision e5242c5f)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c	Packet scheduler API.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13 
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28 
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35 
36 #include <trace/events/qdisc.h>
37 
38 /*
39 
40    Short review.
41    -------------
42 
43    This file consists of two interrelated parts:
44 
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47 
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52 
53    qdiscs are divided into two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets to "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57 
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60 
61    The goal of the routines in this file is to translate
62    information supplied by the user in the form of handles
63    into a form more intelligible to the kernel, to perform some sanity
64    checks and the part of the work which is common to all qdiscs,
65    and to provide rtnetlink notifications.
66 
67    All real intelligent work is done inside qdisc modules.
68 
69 
70 
71    Every discipline has two major routines: enqueue and dequeue.
72 
73    ---dequeue
74 
75    dequeue usually returns a skb to send. It is allowed to return NULL,
76    but it does not mean that queue is empty, it just means that
77    discipline does not want to send anything this time.
78    Queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues q->q is not
80    real packet queue, but however q->q.qlen must be valid.
81 
82    ---enqueue
83 
84    enqueue returns 0 if the packet was enqueued successfully.
85    If a packet (this one or another one) was dropped, it returns
86    a non-zero error code.
87    NET_XMIT_DROP 	- this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91 
92    Auxiliary routines:
93 
94    ---peek
95 
96    like dequeue but without removing a packet from the queue
97 
98    ---reset
99 
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102 
103    ---init
104 
105    initializes newly created qdisc.
106 
107    ---destroy
108 
109    destroys resources allocated by init and during lifetime of qdisc.
110 
111    ---change
112 
113    changes qdisc parameters.
114  */
115 
116 /* Protects list of registered TC modules. It is pure SMP lock. */
/* In this file it is taken for writing by register_qdisc(),
 * unregister_qdisc() and qdisc_set_default(), and for reading by
 * qdisc_get_default() and qdisc_lookup_ops().
 */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118 
119 
120 /************************************************
121  *	Queueing disciplines manipulation.	*
122  ************************************************/
123 
124 
125 /* The list of all installed queueing disciplines. */
126 
/* Singly linked through Qdisc_ops::next: register_qdisc() appends at the
 * tail, unregister_qdisc() unlinks an entry.
 */
127 static struct Qdisc_ops *qdisc_base;
128 
129 /* Register/unregister queueing discipline */
130 
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133 	struct Qdisc_ops *q, **qp;
134 	int rc = -EEXIST;
135 
136 	write_lock(&qdisc_mod_lock);
137 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138 		if (!strcmp(qops->id, q->id))
139 			goto out;
140 
141 	if (qops->enqueue == NULL)
142 		qops->enqueue = noop_qdisc_ops.enqueue;
143 	if (qops->peek == NULL) {
144 		if (qops->dequeue == NULL)
145 			qops->peek = noop_qdisc_ops.peek;
146 		else
147 			goto out_einval;
148 	}
149 	if (qops->dequeue == NULL)
150 		qops->dequeue = noop_qdisc_ops.dequeue;
151 
152 	if (qops->cl_ops) {
153 		const struct Qdisc_class_ops *cops = qops->cl_ops;
154 
155 		if (!(cops->find && cops->walk && cops->leaf))
156 			goto out_einval;
157 
158 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159 			goto out_einval;
160 	}
161 
162 	qops->next = NULL;
163 	*qp = qops;
164 	rc = 0;
165 out:
166 	write_unlock(&qdisc_mod_lock);
167 	return rc;
168 
169 out_einval:
170 	rc = -EINVAL;
171 	goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174 
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177 	struct Qdisc_ops *q, **qp;
178 	int err = -ENOENT;
179 
180 	write_lock(&qdisc_mod_lock);
181 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182 		if (q == qops)
183 			break;
184 	if (q) {
185 		*qp = q->next;
186 		q->next = NULL;
187 		err = 0;
188 	}
189 	write_unlock(&qdisc_mod_lock);
190 
191 	WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194 
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
	/* Hold the module-list lock for reading while the id is copied:
	 * qdisc_set_default() replaces default_qdisc_ops under the write
	 * side of the same lock.
	 */
198 	read_lock(&qdisc_mod_lock);
	/* Copy the id of the current default into @name, bounded by @len. */
199 	strscpy(name, default_qdisc_ops->id, len);
200 	read_unlock(&qdisc_mod_lock);
201 }
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205 	struct Qdisc_ops *q = NULL;
206 
207 	for (q = qdisc_base; q; q = q->next) {
208 		if (!strcmp(name, q->id)) {
209 			if (!try_module_get(q->owner))
210 				q = NULL;
211 			break;
212 		}
213 	}
214 
215 	return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221 	const struct Qdisc_ops *ops;
222 
223 	if (!capable(CAP_NET_ADMIN))
224 		return -EPERM;
225 
226 	write_lock(&qdisc_mod_lock);
227 	ops = qdisc_lookup_default(name);
228 	if (!ops) {
229 		/* Not found, drop lock and try to load module */
230 		write_unlock(&qdisc_mod_lock);
231 		request_module("sch_%s", name);
232 		write_lock(&qdisc_mod_lock);
233 
234 		ops = qdisc_lookup_default(name);
235 	}
236 
237 	if (ops) {
238 		/* Set new default */
239 		module_put(default_qdisc_ops->owner);
240 		default_qdisc_ops = ops;
241 	}
242 	write_unlock(&qdisc_mod_lock);
243 
244 	return ops ? 0 : -ENOENT;
245 }
246 
247 #ifdef CONFIG_NET_SCH_DEFAULT
248 /* Set default value from kernel config */
/* Registered as a late initcall; simply forwards CONFIG_DEFAULT_NET_SCH to
 * qdisc_set_default() and returns its result.
 */
249 static int __init sch_default_qdisc(void)
250 {
251 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
252 }
253 late_initcall(sch_default_qdisc);
254 #endif
255 
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263 	struct Qdisc *q;
264 
265 	if (!qdisc_dev(root))
266 		return (root->handle == handle ? root : NULL);
267 
268 	if (!(root->flags & TCQ_F_BUILTIN) &&
269 	    root->handle == handle)
270 		return root;
271 
272 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273 				   lockdep_rtnl_is_held()) {
274 		if (q->handle == handle)
275 			return q;
276 	}
277 	return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 		ASSERT_RTNL();
284 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 		if (invisible)
286 			q->flags |= TCQ_F_INVISIBLE;
287 	}
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 		ASSERT_RTNL();
295 		hash_del_rcu(&q->hash);
296 	}
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 	struct Qdisc *q;
303 
304 	if (!handle)
305 		return NULL;
306 	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307 	if (q)
308 		goto out;
309 
310 	if (dev_ingress_queue(dev))
311 		q = qdisc_match_from_root(
312 			rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313 			handle);
314 out:
315 	return q;
316 }
317 
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320 	struct netdev_queue *nq;
321 	struct Qdisc *q;
322 
323 	if (!handle)
324 		return NULL;
325 	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326 	if (q)
327 		goto out;
328 
329 	nq = dev_ingress_queue_rcu(dev);
330 	if (nq)
331 		q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332 					  handle);
333 out:
334 	return q;
335 }
336 
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339 	unsigned long cl;
340 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341 
342 	if (cops == NULL)
343 		return NULL;
344 	cl = cops->find(p, classid);
345 
346 	if (cl == 0)
347 		return NULL;
348 	return cops->leaf(p, cl);
349 }
350 
351 /* Find queueing discipline by name */
352 
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355 	struct Qdisc_ops *q = NULL;
356 
357 	if (kind) {
358 		read_lock(&qdisc_mod_lock);
359 		for (q = qdisc_base; q; q = q->next) {
360 			if (nla_strcmp(kind, q->id) == 0) {
361 				if (!try_module_get(q->owner))
362 					q = NULL;
363 				break;
364 			}
365 		}
366 		read_unlock(&qdisc_mod_lock);
367 	}
368 	return q;
369 }
370 
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390 	int low       = roundup(r->mpu, 48);
391 	int high      = roundup(low+1, 48);
392 	int cell_low  = low >> r->cell_log;
393 	int cell_high = (high >> r->cell_log) - 1;
394 
395 	/* rtab is too inaccurate at rates > 100Mbit/s */
396 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397 		pr_debug("TC linklayer: Giving up ATM detection\n");
398 		return TC_LINKLAYER_ETHERNET;
399 	}
400 
401 	if ((cell_high > cell_low) && (cell_high < 256)
402 	    && (rtab[cell_low] == rtab[cell_high])) {
403 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404 			 cell_low, cell_high, rtab[cell_high]);
405 		return TC_LINKLAYER_ATM;
406 	}
407 	return TC_LINKLAYER_ETHERNET;
408 }
409 
/* Head of the list of shared rate tables, chained through ->next.
 * qdisc_get_rtab() pushes new entries at the head and qdisc_put_rtab()
 * unlinks an entry once its refcnt drops to zero.
 */
410 static struct qdisc_rate_table *qdisc_rtab_list;
411 
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413 					struct nlattr *tab,
414 					struct netlink_ext_ack *extack)
415 {
416 	struct qdisc_rate_table *rtab;
417 
418 	if (tab == NULL || r->rate == 0 ||
419 	    r->cell_log == 0 || r->cell_log >= 32 ||
420 	    nla_len(tab) != TC_RTAB_SIZE) {
421 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422 		return NULL;
423 	}
424 
425 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
428 			rtab->refcnt++;
429 			return rtab;
430 		}
431 	}
432 
433 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434 	if (rtab) {
435 		rtab->rate = *r;
436 		rtab->refcnt = 1;
437 		memcpy(rtab->data, nla_data(tab), 1024);
438 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
439 			r->linklayer = __detect_linklayer(r, rtab->data);
440 		rtab->next = qdisc_rtab_list;
441 		qdisc_rtab_list = rtab;
442 	} else {
443 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444 	}
445 	return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448 
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451 	struct qdisc_rate_table *rtab, **rtabp;
452 
453 	if (!tab || --tab->refcnt)
454 		return;
455 
456 	for (rtabp = &qdisc_rtab_list;
457 	     (rtab = *rtabp) != NULL;
458 	     rtabp = &rtab->next) {
459 		if (rtab == tab) {
460 			*rtabp = rtab->next;
461 			kfree(rtab);
462 			return;
463 		}
464 	}
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467 
/* List of shared size tables: qdisc_get_stab() appends new entries and
 * qdisc_put_stab() removes an entry when its refcnt reaches zero.
 */
468 static LIST_HEAD(qdisc_stab_list);
469 
/* Netlink policy used by qdisc_get_stab() to parse the nested size-table
 * attribute: TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA is
 * a binary blob (qdisc_get_stab() interprets it as an array of u16).
 */
470 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
471 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
472 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
473 };
474 
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476 					       struct netlink_ext_ack *extack)
477 {
478 	struct nlattr *tb[TCA_STAB_MAX + 1];
479 	struct qdisc_size_table *stab;
480 	struct tc_sizespec *s;
481 	unsigned int tsize = 0;
482 	u16 *tab = NULL;
483 	int err;
484 
485 	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486 					  extack);
487 	if (err < 0)
488 		return ERR_PTR(err);
489 	if (!tb[TCA_STAB_BASE]) {
490 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491 		return ERR_PTR(-EINVAL);
492 	}
493 
494 	s = nla_data(tb[TCA_STAB_BASE]);
495 
496 	if (s->tsize > 0) {
497 		if (!tb[TCA_STAB_DATA]) {
498 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499 			return ERR_PTR(-EINVAL);
500 		}
501 		tab = nla_data(tb[TCA_STAB_DATA]);
502 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503 	}
504 
505 	if (tsize != s->tsize || (!tab && tsize > 0)) {
506 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
507 		return ERR_PTR(-EINVAL);
508 	}
509 
510 	list_for_each_entry(stab, &qdisc_stab_list, list) {
511 		if (memcmp(&stab->szopts, s, sizeof(*s)))
512 			continue;
513 		if (tsize > 0 &&
514 		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515 			continue;
516 		stab->refcnt++;
517 		return stab;
518 	}
519 
520 	if (s->size_log > STAB_SIZE_LOG_MAX ||
521 	    s->cell_log > STAB_SIZE_LOG_MAX) {
522 		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523 		return ERR_PTR(-EINVAL);
524 	}
525 
526 	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527 	if (!stab)
528 		return ERR_PTR(-ENOMEM);
529 
530 	stab->refcnt = 1;
531 	stab->szopts = *s;
532 	if (tsize > 0)
533 		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534 
535 	list_add_tail(&stab->list, &qdisc_stab_list);
536 
537 	return stab;
538 }
539 
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542 	if (!tab)
543 		return;
544 
545 	if (--tab->refcnt == 0) {
546 		list_del(&tab->list);
547 		kfree_rcu(tab, rcu);
548 	}
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551 
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554 	struct nlattr *nest;
555 
556 	nest = nla_nest_start_noflag(skb, TCA_STAB);
557 	if (nest == NULL)
558 		goto nla_put_failure;
559 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560 		goto nla_put_failure;
561 	nla_nest_end(skb, nest);
562 
563 	return skb->len;
564 
565 nla_put_failure:
566 	return -1;
567 }
568 
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570 			       const struct qdisc_size_table *stab)
571 {
572 	int pkt_len, slot;
573 
574 	pkt_len = skb->len + stab->szopts.overhead;
575 	if (unlikely(!stab->szopts.tsize))
576 		goto out;
577 
578 	slot = pkt_len + stab->szopts.cell_align;
579 	if (unlikely(slot < 0))
580 		slot = 0;
581 
582 	slot >>= stab->szopts.cell_log;
583 	if (likely(slot < stab->szopts.tsize))
584 		pkt_len = stab->data[slot];
585 	else
586 		pkt_len = stab->data[stab->szopts.tsize - 1] *
587 				(slot / stab->szopts.tsize) +
588 				stab->data[slot % stab->szopts.tsize];
589 
590 	pkt_len <<= stab->szopts.size_log;
591 out:
592 	if (unlikely(pkt_len < 1))
593 		pkt_len = 1;
594 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601 			txt, qdisc->ops->id, qdisc->handle >> 16);
602 		qdisc->flags |= TCQ_F_WARN_NONWC;
603 	}
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606 
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610 						 timer);
611 
612 	rcu_read_lock();
613 	__netif_schedule(qdisc_root(wd->qdisc));
614 	rcu_read_unlock();
615 
616 	return HRTIMER_NORESTART;
617 }
618 
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620 				 clockid_t clockid)
621 {
622 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623 	wd->timer.function = qdisc_watchdog;
624 	wd->qdisc = qdisc;
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627 
/* Initialise watchdog @wd for @qdisc using CLOCK_MONOTONIC; convenience
 * wrapper around qdisc_watchdog_init_clockid().
 */
628 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
629 {
630 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
631 }
632 EXPORT_SYMBOL(qdisc_watchdog_init);
633 
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
635 				      u64 delta_ns)
636 {
637 	bool deactivated;
638 
639 	rcu_read_lock();
640 	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
641 			       &qdisc_root_sleeping(wd->qdisc)->state);
642 	rcu_read_unlock();
643 	if (deactivated)
644 		return;
645 
646 	if (hrtimer_is_queued(&wd->timer)) {
647 		u64 softexpires;
648 
649 		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
650 		/* If timer is already set in [expires, expires + delta_ns],
651 		 * do not reprogram it.
652 		 */
653 		if (softexpires - expires <= delta_ns)
654 			return;
655 	}
656 
657 	hrtimer_start_range_ns(&wd->timer,
658 			       ns_to_ktime(expires),
659 			       delta_ns,
660 			       HRTIMER_MODE_ABS_PINNED);
661 }
662 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
663 
/* Cancel the hrtimer embedded in watchdog @wd. */
664 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
665 {
666 	hrtimer_cancel(&wd->timer);
667 }
668 EXPORT_SYMBOL(qdisc_watchdog_cancel);
669 
670 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
671 {
672 	struct hlist_head *h;
673 	unsigned int i;
674 
675 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
676 
677 	if (h != NULL) {
678 		for (i = 0; i < n; i++)
679 			INIT_HLIST_HEAD(&h[i]);
680 	}
681 	return h;
682 }
683 
684 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
685 {
686 	struct Qdisc_class_common *cl;
687 	struct hlist_node *next;
688 	struct hlist_head *nhash, *ohash;
689 	unsigned int nsize, nmask, osize;
690 	unsigned int i, h;
691 
692 	/* Rehash when load factor exceeds 0.75 */
693 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
694 		return;
695 	nsize = clhash->hashsize * 2;
696 	nmask = nsize - 1;
697 	nhash = qdisc_class_hash_alloc(nsize);
698 	if (nhash == NULL)
699 		return;
700 
701 	ohash = clhash->hash;
702 	osize = clhash->hashsize;
703 
704 	sch_tree_lock(sch);
705 	for (i = 0; i < osize; i++) {
706 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
707 			h = qdisc_class_hash(cl->classid, nmask);
708 			hlist_add_head(&cl->hnode, &nhash[h]);
709 		}
710 	}
711 	clhash->hash     = nhash;
712 	clhash->hashsize = nsize;
713 	clhash->hashmask = nmask;
714 	sch_tree_unlock(sch);
715 
716 	kvfree(ohash);
717 }
718 EXPORT_SYMBOL(qdisc_class_hash_grow);
719 
720 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
721 {
722 	unsigned int size = 4;
723 
724 	clhash->hash = qdisc_class_hash_alloc(size);
725 	if (!clhash->hash)
726 		return -ENOMEM;
727 	clhash->hashsize  = size;
728 	clhash->hashmask  = size - 1;
729 	clhash->hashelems = 0;
730 	return 0;
731 }
732 EXPORT_SYMBOL(qdisc_class_hash_init);
733 
/* Release the bucket array of @clhash.  Only the array itself is freed;
 * nothing is done here about classes that may still be linked into it.
 */
734 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
735 {
736 	kvfree(clhash->hash);
737 }
738 EXPORT_SYMBOL(qdisc_class_hash_destroy);
739 
740 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
741 			     struct Qdisc_class_common *cl)
742 {
743 	unsigned int h;
744 
745 	INIT_HLIST_NODE(&cl->hnode);
746 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
747 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
748 	clhash->hashelems++;
749 }
750 EXPORT_SYMBOL(qdisc_class_hash_insert);
751 
/* Unlink class @cl from its bucket and decrement the element count of
 * @clhash.  The table is not shrunk here.
 */
752 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
753 			     struct Qdisc_class_common *cl)
754 {
755 	hlist_del(&cl->hnode);
756 	clhash->hashelems--;
757 }
758 EXPORT_SYMBOL(qdisc_class_hash_remove);
759 
760 /* Allocate an unique handle from space managed by kernel
761  * Possible range is [8000-FFFF]:0000 (0x8000 values)
762  */
763 static u32 qdisc_alloc_handle(struct net_device *dev)
764 {
765 	int i = 0x8000;
766 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
767 
768 	do {
769 		autohandle += TC_H_MAKE(0x10000U, 0);
770 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
771 			autohandle = TC_H_MAKE(0x80000000U, 0);
772 		if (!qdisc_lookup(dev, autohandle))
773 			return autohandle;
774 		cond_resched();
775 	} while	(--i > 0);
776 
777 	return 0;
778 }
779 
780 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
781 {
782 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
783 	const struct Qdisc_class_ops *cops;
784 	unsigned long cl;
785 	u32 parentid;
786 	bool notify;
787 	int drops;
788 
789 	if (n == 0 && len == 0)
790 		return;
791 	drops = max_t(int, n, 0);
792 	rcu_read_lock();
793 	while ((parentid = sch->parent)) {
794 		if (parentid == TC_H_ROOT)
795 			break;
796 
797 		if (sch->flags & TCQ_F_NOPARENT)
798 			break;
799 		/* Notify parent qdisc only if child qdisc becomes empty.
800 		 *
801 		 * If child was empty even before update then backlog
802 		 * counter is screwed and we skip notification because
803 		 * parent class is already passive.
804 		 *
805 		 * If the original child was offloaded then it is allowed
806 		 * to be seem as empty, so the parent is notified anyway.
807 		 */
808 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
809 						       !qdisc_is_offloaded);
810 		/* TODO: perform the search on a per txq basis */
811 		sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
812 		if (sch == NULL) {
813 			WARN_ON_ONCE(parentid != TC_H_ROOT);
814 			break;
815 		}
816 		cops = sch->ops->cl_ops;
817 		if (notify && cops->qlen_notify) {
818 			cl = cops->find(sch, parentid);
819 			cops->qlen_notify(sch, cl);
820 		}
821 		sch->q.qlen -= n;
822 		sch->qstats.backlog -= len;
823 		__qdisc_qstats_drop(sch, drops);
824 	}
825 	rcu_read_unlock();
826 }
827 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
828 
829 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
830 			      void *type_data)
831 {
832 	struct net_device *dev = qdisc_dev(sch);
833 	int err;
834 
835 	sch->flags &= ~TCQ_F_OFFLOADED;
836 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
837 		return 0;
838 
839 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
840 	if (err == -EOPNOTSUPP)
841 		return 0;
842 
843 	if (!err)
844 		sch->flags |= TCQ_F_OFFLOADED;
845 
846 	return err;
847 }
848 EXPORT_SYMBOL(qdisc_offload_dump_helper);
849 
850 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
851 				struct Qdisc *new, struct Qdisc *old,
852 				enum tc_setup_type type, void *type_data,
853 				struct netlink_ext_ack *extack)
854 {
855 	bool any_qdisc_is_offloaded;
856 	int err;
857 
858 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
859 		return;
860 
861 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
862 
863 	/* Don't report error if the graft is part of destroy operation. */
864 	if (!err || !new || new == &noop_qdisc)
865 		return;
866 
867 	/* Don't report error if the parent, the old child and the new
868 	 * one are not offloaded.
869 	 */
870 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
871 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
872 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
873 
874 	if (any_qdisc_is_offloaded)
875 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
876 }
877 EXPORT_SYMBOL(qdisc_offload_graft_helper);
878 
879 void qdisc_offload_query_caps(struct net_device *dev,
880 			      enum tc_setup_type type,
881 			      void *caps, size_t caps_len)
882 {
883 	const struct net_device_ops *ops = dev->netdev_ops;
884 	struct tc_query_caps_base base = {
885 		.type = type,
886 		.caps = caps,
887 	};
888 
889 	memset(caps, 0, caps_len);
890 
891 	if (ops->ndo_setup_tc)
892 		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
893 }
894 EXPORT_SYMBOL(qdisc_offload_query_caps);
895 
896 static void qdisc_offload_graft_root(struct net_device *dev,
897 				     struct Qdisc *new, struct Qdisc *old,
898 				     struct netlink_ext_ack *extack)
899 {
900 	struct tc_root_qopt_offload graft_offload = {
901 		.command	= TC_ROOT_GRAFT,
902 		.handle		= new ? new->handle : 0,
903 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
904 				  (old && old->flags & TCQ_F_INGRESS),
905 	};
906 
907 	qdisc_offload_graft_helper(dev, NULL, new, old,
908 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
909 }
910 
911 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
912 			 u32 portid, u32 seq, u16 flags, int event,
913 			 struct netlink_ext_ack *extack)
914 {
915 	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
916 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
917 	struct tcmsg *tcm;
918 	struct nlmsghdr  *nlh;
919 	unsigned char *b = skb_tail_pointer(skb);
920 	struct gnet_dump d;
921 	struct qdisc_size_table *stab;
922 	u32 block_index;
923 	__u32 qlen;
924 
925 	cond_resched();
926 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
927 	if (!nlh)
928 		goto out_nlmsg_trim;
929 	tcm = nlmsg_data(nlh);
930 	tcm->tcm_family = AF_UNSPEC;
931 	tcm->tcm__pad1 = 0;
932 	tcm->tcm__pad2 = 0;
933 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
934 	tcm->tcm_parent = clid;
935 	tcm->tcm_handle = q->handle;
936 	tcm->tcm_info = refcount_read(&q->refcnt);
937 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
938 		goto nla_put_failure;
939 	if (q->ops->ingress_block_get) {
940 		block_index = q->ops->ingress_block_get(q);
941 		if (block_index &&
942 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
943 			goto nla_put_failure;
944 	}
945 	if (q->ops->egress_block_get) {
946 		block_index = q->ops->egress_block_get(q);
947 		if (block_index &&
948 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
949 			goto nla_put_failure;
950 	}
951 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
952 		goto nla_put_failure;
953 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
954 		goto nla_put_failure;
955 	qlen = qdisc_qlen_sum(q);
956 
957 	stab = rtnl_dereference(q->stab);
958 	if (stab && qdisc_dump_stab(skb, stab) < 0)
959 		goto nla_put_failure;
960 
961 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
962 					 NULL, &d, TCA_PAD) < 0)
963 		goto nla_put_failure;
964 
965 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
966 		goto nla_put_failure;
967 
968 	if (qdisc_is_percpu_stats(q)) {
969 		cpu_bstats = q->cpu_bstats;
970 		cpu_qstats = q->cpu_qstats;
971 	}
972 
973 	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
974 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
975 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
976 		goto nla_put_failure;
977 
978 	if (gnet_stats_finish_copy(&d) < 0)
979 		goto nla_put_failure;
980 
981 	if (extack && extack->_msg &&
982 	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
983 		goto out_nlmsg_trim;
984 
985 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
986 
987 	return skb->len;
988 
989 out_nlmsg_trim:
990 nla_put_failure:
991 	nlmsg_trim(skb, b);
992 	return -1;
993 }
994 
995 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
996 {
997 	if (q->flags & TCQ_F_BUILTIN)
998 		return true;
999 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1000 		return true;
1001 
1002 	return false;
1003 }
1004 
1005 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1006 			struct nlmsghdr *n, u32 clid,
1007 			struct Qdisc *old, struct Qdisc *new,
1008 			struct netlink_ext_ack *extack)
1009 {
1010 	struct sk_buff *skb;
1011 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1012 
1013 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1014 	if (!skb)
1015 		return -ENOBUFS;
1016 
1017 	if (old && !tc_qdisc_dump_ignore(old, false)) {
1018 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1019 				  0, RTM_DELQDISC, extack) < 0)
1020 			goto err_out;
1021 	}
1022 	if (new && !tc_qdisc_dump_ignore(new, false)) {
1023 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1024 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1025 			goto err_out;
1026 	}
1027 
1028 	if (skb->len)
1029 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1030 				      n->nlmsg_flags & NLM_F_ECHO);
1031 
1032 err_out:
1033 	kfree_skb(skb);
1034 	return -EINVAL;
1035 }
1036 
1037 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1038 			       struct nlmsghdr *n, u32 clid,
1039 			       struct Qdisc *old, struct Qdisc *new,
1040 			       struct netlink_ext_ack *extack)
1041 {
1042 	if (new || old)
1043 		qdisc_notify(net, skb, n, clid, old, new, extack);
1044 
1045 	if (old)
1046 		qdisc_put(old);
1047 }
1048 
1049 static void qdisc_clear_nolock(struct Qdisc *sch)
1050 {
1051 	sch->flags &= ~TCQ_F_NOLOCK;
1052 	if (!(sch->flags & TCQ_F_CPUSTATS))
1053 		return;
1054 
1055 	free_percpu(sch->cpu_bstats);
1056 	free_percpu(sch->cpu_qstats);
1057 	sch->cpu_bstats = NULL;
1058 	sch->cpu_qstats = NULL;
1059 	sch->flags &= ~TCQ_F_CPUSTATS;
1060 }
1061 
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * A NULL "parent" selects the device-level path (root qdisc, or the
 * ingress queue when either "old" or "new" carries TCQ_F_INGRESS).
 * Otherwise the graft is delegated to the parent's class ops.
 *
 * Returns 0 on success or a negative errno:
 *   -ENOENT      no ingress queue on the device / class not found
 *   -EBUSY       qdisc_refcount_dec_if_one() failed on the ingress qdisc
 *   -EOPNOTSUPP  parent has no class ops or no ->graft()
 *   -EINVAL      noqueue given for a class, or STAB on a non-root qdisc
 *   or whatever the parent's ->graft() returns.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			/* From here on "q" is whatever currently sits on the
			 * ingress queue, not the caller-supplied "old".
			 */
			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		/* Deactivate a running device around the graft; it is
		 * re-activated at the end of this branch.
		 */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs that provide ->attach() skip the per-queue graft
		 * below; ->attach() is called after dev->qdisc is updated.
		 */
		if (new && new->ops->attach && !ingress)
			goto skip;

		if (!ingress) {
			/* Install "new" on every TX queue.  One extra reference
			 * is taken for each queue after the first, and the
			 * qdisc previously on each queue is put.
			 */
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			/* Publish the new root in dev->qdisc (noop_qdisc when
			 * "new" is NULL), then notify and put the old root.
			 */
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new, extack);

			if (new && new->ops->attach)
				new->ops->attach(new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		/* A size table on "new" is only accepted when the parent
		 * has TCQ_F_MQROOT set.
		 */
		if (new &&
		    !(parent->flags & TCQ_F_MQROOT) &&
		    rcu_access_pointer(new->stab)) {
			NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
			return -EINVAL;
		}
		/* ->graft() hands back the previous leaf through &old. */
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new, extack);
	}
	return 0;
}
1187 
1188 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1189 				   struct netlink_ext_ack *extack)
1190 {
1191 	u32 block_index;
1192 
1193 	if (tca[TCA_INGRESS_BLOCK]) {
1194 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1195 
1196 		if (!block_index) {
1197 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1198 			return -EINVAL;
1199 		}
1200 		if (!sch->ops->ingress_block_set) {
1201 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1202 			return -EOPNOTSUPP;
1203 		}
1204 		sch->ops->ingress_block_set(sch, block_index);
1205 	}
1206 	if (tca[TCA_EGRESS_BLOCK]) {
1207 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1208 
1209 		if (!block_index) {
1210 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1211 			return -EINVAL;
1212 		}
1213 		if (!sch->ops->egress_block_set) {
1214 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1215 			return -EOPNOTSUPP;
1216 		}
1217 		sch->ops->egress_block_set(sch, block_index);
1218 	}
1219 	return 0;
1220 }
1221 
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.

   The qdisc kind is looked up from tca[TCA_KIND]; the remaining
   attributes (block indexes, TCA_STAB, TCA_OPTIONS, TCA_RATE) are
   applied in that order.

   Returns the new qdisc on success.  On failure returns NULL and
   stores a negative errno in *errp; *errp is only written on the
   failure paths.  -EAGAIN means the RTNL lock was dropped to load a
   module and the caller has to replay the whole request.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		/* The ingress handle is only valid for qdiscs flagged
		 * TCQ_F_INGRESS.
		 */
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		/* A zero handle asks for one to be allocated. */
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out3;
		}
		rcu_assign_pointer(sch->stab, stab);
	}

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out4;
	}

	if (tca[TCA_RATE]) {
		/* A rate estimator is refused on TCQ_F_MQROOT qdiscs. */
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

	/* Unwind ladder: each label undoes one more stage and falls
	 * through to the labels below it.
	 */
err_out4:
	/* Even if ops->init() failed, we call ops->destroy()
	 * like qdisc_create_dflt().
	 */
	if (ops->destroy)
		ops->destroy(sch);
	qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
	lockdep_unregister_key(&sch->root_lock_key);
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
1377 
1378 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1379 			struct netlink_ext_ack *extack)
1380 {
1381 	struct qdisc_size_table *ostab, *stab = NULL;
1382 	int err = 0;
1383 
1384 	if (tca[TCA_OPTIONS]) {
1385 		if (!sch->ops->change) {
1386 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1387 			return -EINVAL;
1388 		}
1389 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1390 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1391 			return -EOPNOTSUPP;
1392 		}
1393 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1394 		if (err)
1395 			return err;
1396 	}
1397 
1398 	if (tca[TCA_STAB]) {
1399 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1400 		if (IS_ERR(stab))
1401 			return PTR_ERR(stab);
1402 	}
1403 
1404 	ostab = rtnl_dereference(sch->stab);
1405 	rcu_assign_pointer(sch->stab, stab);
1406 	qdisc_put_stab(ostab);
1407 
1408 	if (tca[TCA_RATE]) {
1409 		/* NB: ignores errors from replace_estimator
1410 		   because change can't be undone. */
1411 		if (sch->flags & TCQ_F_MQROOT)
1412 			goto out;
1413 		gen_replace_estimator(&sch->bstats,
1414 				      sch->cpu_bstats,
1415 				      &sch->rate_est,
1416 				      NULL,
1417 				      true,
1418 				      tca[TCA_RATE]);
1419 	}
1420 out:
1421 	return 0;
1422 }
1423 
/* Walker context for check_loop().  The qdisc_walker must stay the
 * first member: check_loop_fn() casts the walker pointer it receives
 * back to this structure.
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;	/* qdisc that must not appear among the leaves */
	int			depth;	/* current recursion depth */
};

/* Forward declaration: check_loop() and check_loop_fn() call each other. */
static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1432 
1433 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1434 {
1435 	struct check_loop_arg	arg;
1436 
1437 	if (q->ops->cl_ops == NULL)
1438 		return 0;
1439 
1440 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1441 	arg.w.fn = check_loop_fn;
1442 	arg.depth = depth;
1443 	arg.p = p;
1444 	q->ops->cl_ops->walk(q, &arg.w);
1445 	return arg.w.stop ? -ELOOP : 0;
1446 }
1447 
1448 static int
1449 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1450 {
1451 	struct Qdisc *leaf;
1452 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1453 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1454 
1455 	leaf = cops->leaf(q, cl);
1456 	if (leaf) {
1457 		if (leaf == arg->p || arg->depth > 7)
1458 			return -ELOOP;
1459 		return check_loop(leaf, arg->p, arg->depth + 1);
1460 	}
1461 	return 0;
1462 }
1463 
/* Netlink attribute policy handed to nlmsg_parse_deprecated() by the
 * qdisc and class request handlers in this file.  Attribute types not
 * listed here are left zero-initialized.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1474 
1475 /*
1476  * Delete/get qdisc.
1477  */
1478 
1479 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1480 			struct netlink_ext_ack *extack)
1481 {
1482 	struct net *net = sock_net(skb->sk);
1483 	struct tcmsg *tcm = nlmsg_data(n);
1484 	struct nlattr *tca[TCA_MAX + 1];
1485 	struct net_device *dev;
1486 	u32 clid;
1487 	struct Qdisc *q = NULL;
1488 	struct Qdisc *p = NULL;
1489 	int err;
1490 
1491 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1492 				     rtm_tca_policy, extack);
1493 	if (err < 0)
1494 		return err;
1495 
1496 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1497 	if (!dev)
1498 		return -ENODEV;
1499 
1500 	clid = tcm->tcm_parent;
1501 	if (clid) {
1502 		if (clid != TC_H_ROOT) {
1503 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1504 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1505 				if (!p) {
1506 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1507 					return -ENOENT;
1508 				}
1509 				q = qdisc_leaf(p, clid);
1510 			} else if (dev_ingress_queue(dev)) {
1511 				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1512 			}
1513 		} else {
1514 			q = rtnl_dereference(dev->qdisc);
1515 		}
1516 		if (!q) {
1517 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1518 			return -ENOENT;
1519 		}
1520 
1521 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1522 			NL_SET_ERR_MSG(extack, "Invalid handle");
1523 			return -EINVAL;
1524 		}
1525 	} else {
1526 		q = qdisc_lookup(dev, tcm->tcm_handle);
1527 		if (!q) {
1528 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1529 			return -ENOENT;
1530 		}
1531 	}
1532 
1533 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1534 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1535 		return -EINVAL;
1536 	}
1537 
1538 	if (n->nlmsg_type == RTM_DELQDISC) {
1539 		if (!clid) {
1540 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1541 			return -EINVAL;
1542 		}
1543 		if (q->handle == 0) {
1544 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1545 			return -ENOENT;
1546 		}
1547 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1548 		if (err != 0)
1549 			return err;
1550 	} else {
1551 		qdisc_notify(net, skb, n, clid, NULL, q, NULL);
1552 	}
1553 	return 0;
1554 }
1555 
1556 static bool req_create_or_replace(struct nlmsghdr *n)
1557 {
1558 	return (n->nlmsg_flags & NLM_F_CREATE &&
1559 		n->nlmsg_flags & NLM_F_REPLACE);
1560 }
1561 
1562 static bool req_create_exclusive(struct nlmsghdr *n)
1563 {
1564 	return (n->nlmsg_flags & NLM_F_CREATE &&
1565 		n->nlmsg_flags & NLM_F_EXCL);
1566 }
1567 
1568 static bool req_change(struct nlmsghdr *n)
1569 {
1570 	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1571 		!(n->nlmsg_flags & NLM_F_REPLACE) &&
1572 		!(n->nlmsg_flags & NLM_F_EXCL));
1573 }
1574 
/*
 * Create/change qdisc.
 *
 * Depending on the handles in the request and on the NLM_F_* flags,
 * one of three things happens:
 *   - an existing qdisc is changed in place (qdisc_change()),
 *   - an existing qdisc found by handle is grafted at tcm_parent,
 *   - a new qdisc is created (qdisc_create()) and grafted.
 *
 * The whole request is re-parsed and replayed from the "replay" label
 * when qdisc_create() fails with -EAGAIN.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* Find the qdisc currently attached at "clid": a leaf of
		 * parent "p", the ingress queue's qdisc, or the device root.
		 */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* An explicit handle was given that does not
				 * match what sits at "clid" (if anything).
				 */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				/* The handle names an existing qdisc: validate
				 * it, then graft it at "clid".
				 */
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				/* Reference taken here is dropped below if
				 * qdisc_graft() fails.
				 */
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   1) change it or 2) create/graft new one.
				 *   If the requested qdisc kind is different
				 *   than the existing one, then we choose graft.
				 *   If they are the same then this is "change"
				 *   operation - just let it fallthrough..
				 *
				 *   1. We are allowed to create/graft only
				 *   if the request is explicitly stating
				 *   "please create if it doesn't exist".
				 *
				 *   2. If the request is to exclusive create
				 *   then the qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   This will happen when for example tc
				 *   utility issues a "change" command.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft.
				 */
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					if (req_create_or_replace(n) ||
					    req_create_exclusive(n))
						goto create_n_graft;
					else if (req_change(n))
						goto create_n_graft2;
				}
			}
		}
	} else {
		/* No parent: the qdisc can only be addressed by handle. */
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q, extack);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	/* create_n_graft2 is entered directly (skipping the NLM_F_CREATE
	 * check above) for the flag-less "change" case described earlier.
	 */
create_n_graft2:
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev),
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Pick the TX queue: ask the parent's class ops if they can
		 * select one, else use the parent's queue, else queue 0.
		 */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1773 
1774 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1775 			      struct netlink_callback *cb,
1776 			      int *q_idx_p, int s_q_idx, bool recur,
1777 			      bool dump_invisible)
1778 {
1779 	int ret = 0, q_idx = *q_idx_p;
1780 	struct Qdisc *q;
1781 	int b;
1782 
1783 	if (!root)
1784 		return 0;
1785 
1786 	q = root;
1787 	if (q_idx < s_q_idx) {
1788 		q_idx++;
1789 	} else {
1790 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1791 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1792 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1793 				  RTM_NEWQDISC, NULL) <= 0)
1794 			goto done;
1795 		q_idx++;
1796 	}
1797 
1798 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1799 	 * itself has already been dumped.
1800 	 *
1801 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1802 	 * qdisc hashtable, we don't want to hit it again
1803 	 */
1804 	if (!qdisc_dev(root) || !recur)
1805 		goto out;
1806 
1807 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1808 		if (q_idx < s_q_idx) {
1809 			q_idx++;
1810 			continue;
1811 		}
1812 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1813 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1814 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1815 				  RTM_NEWQDISC, NULL) <= 0)
1816 			goto done;
1817 		q_idx++;
1818 	}
1819 
1820 out:
1821 	*q_idx_p = q_idx;
1822 	return ret;
1823 done:
1824 	ret = -1;
1825 	goto out;
1826 }
1827 
1828 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1829 {
1830 	struct net *net = sock_net(skb->sk);
1831 	int idx, q_idx;
1832 	int s_idx, s_q_idx;
1833 	struct net_device *dev;
1834 	const struct nlmsghdr *nlh = cb->nlh;
1835 	struct nlattr *tca[TCA_MAX + 1];
1836 	int err;
1837 
1838 	s_idx = cb->args[0];
1839 	s_q_idx = q_idx = cb->args[1];
1840 
1841 	idx = 0;
1842 	ASSERT_RTNL();
1843 
1844 	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1845 				     rtm_tca_policy, cb->extack);
1846 	if (err < 0)
1847 		return err;
1848 
1849 	for_each_netdev(net, dev) {
1850 		struct netdev_queue *dev_queue;
1851 
1852 		if (idx < s_idx)
1853 			goto cont;
1854 		if (idx > s_idx)
1855 			s_q_idx = 0;
1856 		q_idx = 0;
1857 
1858 		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1859 				       skb, cb, &q_idx, s_q_idx,
1860 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1861 			goto done;
1862 
1863 		dev_queue = dev_ingress_queue(dev);
1864 		if (dev_queue &&
1865 		    tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1866 				       skb, cb, &q_idx, s_q_idx, false,
1867 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1868 			goto done;
1869 
1870 cont:
1871 		idx++;
1872 	}
1873 
1874 done:
1875 	cb->args[0] = idx;
1876 	cb->args[1] = q_idx;
1877 
1878 	return skb->len;
1879 }
1880 
1881 
1882 
1883 /************************************************
1884  *	Traffic classes manipulation.		*
1885  ************************************************/
1886 
1887 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1888 			  unsigned long cl, u32 portid, u32 seq, u16 flags,
1889 			  int event, struct netlink_ext_ack *extack)
1890 {
1891 	struct tcmsg *tcm;
1892 	struct nlmsghdr  *nlh;
1893 	unsigned char *b = skb_tail_pointer(skb);
1894 	struct gnet_dump d;
1895 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1896 
1897 	cond_resched();
1898 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1899 	if (!nlh)
1900 		goto out_nlmsg_trim;
1901 	tcm = nlmsg_data(nlh);
1902 	tcm->tcm_family = AF_UNSPEC;
1903 	tcm->tcm__pad1 = 0;
1904 	tcm->tcm__pad2 = 0;
1905 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1906 	tcm->tcm_parent = q->handle;
1907 	tcm->tcm_handle = q->handle;
1908 	tcm->tcm_info = 0;
1909 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1910 		goto nla_put_failure;
1911 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1912 		goto nla_put_failure;
1913 
1914 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1915 					 NULL, &d, TCA_PAD) < 0)
1916 		goto nla_put_failure;
1917 
1918 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1919 		goto nla_put_failure;
1920 
1921 	if (gnet_stats_finish_copy(&d) < 0)
1922 		goto nla_put_failure;
1923 
1924 	if (extack && extack->_msg &&
1925 	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1926 		goto out_nlmsg_trim;
1927 
1928 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1929 
1930 	return skb->len;
1931 
1932 out_nlmsg_trim:
1933 nla_put_failure:
1934 	nlmsg_trim(skb, b);
1935 	return -1;
1936 }
1937 
1938 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1939 			 struct nlmsghdr *n, struct Qdisc *q,
1940 			 unsigned long cl, int event, struct netlink_ext_ack *extack)
1941 {
1942 	struct sk_buff *skb;
1943 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1944 
1945 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1946 	if (!skb)
1947 		return -ENOBUFS;
1948 
1949 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1950 		kfree_skb(skb);
1951 		return -EINVAL;
1952 	}
1953 
1954 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1955 			      n->nlmsg_flags & NLM_F_ECHO);
1956 }
1957 
1958 static int tclass_del_notify(struct net *net,
1959 			     const struct Qdisc_class_ops *cops,
1960 			     struct sk_buff *oskb, struct nlmsghdr *n,
1961 			     struct Qdisc *q, unsigned long cl,
1962 			     struct netlink_ext_ack *extack)
1963 {
1964 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1965 	struct sk_buff *skb;
1966 	int err = 0;
1967 
1968 	if (!cops->delete)
1969 		return -EOPNOTSUPP;
1970 
1971 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1972 	if (!skb)
1973 		return -ENOBUFS;
1974 
1975 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1976 			   RTM_DELTCLASS, extack) < 0) {
1977 		kfree_skb(skb);
1978 		return -EINVAL;
1979 	}
1980 
1981 	err = cops->delete(q, cl, extack);
1982 	if (err) {
1983 		kfree_skb(skb);
1984 		return err;
1985 	}
1986 
1987 	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1988 			     n->nlmsg_flags & NLM_F_ECHO);
1989 	return err;
1990 }
1991 
1992 #ifdef CONFIG_NET_CLS
1993 
/* Walker context passed to tcf_node_bind() through tcf_proto ->walk().
 * The tcf_walker must stay the first member: tcf_node_bind() casts the
 * walker pointer back to this structure.
 */
struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;	/* class being walked; last argument to ->bind_class() */
	unsigned long cl;	/* class handed to ->bind_class() (0 when unbinding) */
	u32 classid;		/* class id handed to ->bind_class() */
};
2000 
2001 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
2002 {
2003 	struct tcf_bind_args *a = (void *)arg;
2004 
2005 	if (n && tp->ops->bind_class) {
2006 		struct Qdisc *q = tcf_block_q(tp->chain->block);
2007 
2008 		sch_tree_lock(q);
2009 		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2010 		sch_tree_unlock(q);
2011 	}
2012 	return 0;
2013 }
2014 
/* Walker context used by tc_bind_tclass() / tc_bind_class_walker().
 * The qdisc_walker must stay the first member: the walker callback
 * casts the walker pointer back to this structure.
 */
struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;	/* class to bind filters to */
	u32 portid;		/* stored by tc_bind_tclass(); not read by tc_bind_class_walker() */
	u32 clid;		/* class id forwarded to the filter walker */
};
2021 
2022 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2023 				struct qdisc_walker *w)
2024 {
2025 	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2026 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2027 	struct tcf_block *block;
2028 	struct tcf_chain *chain;
2029 
2030 	block = cops->tcf_block(q, cl, NULL);
2031 	if (!block)
2032 		return 0;
2033 	for (chain = tcf_get_next_chain(block, NULL);
2034 	     chain;
2035 	     chain = tcf_get_next_chain(block, chain)) {
2036 		struct tcf_proto *tp;
2037 
2038 		for (tp = tcf_get_next_proto(chain, NULL);
2039 		     tp; tp = tcf_get_next_proto(chain, tp)) {
2040 			struct tcf_bind_args arg = {};
2041 
2042 			arg.w.fn = tcf_node_bind;
2043 			arg.classid = a->clid;
2044 			arg.base = cl;
2045 			arg.cl = a->new_cl;
2046 			tp->ops->walk(tp, &arg.w, true);
2047 		}
2048 	}
2049 
2050 	return 0;
2051 }
2052 
2053 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2054 			   unsigned long new_cl)
2055 {
2056 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2057 	struct tc_bind_class_args args = {};
2058 
2059 	if (!cops->tcf_block)
2060 		return;
2061 	args.portid = portid;
2062 	args.clid = clid;
2063 	args.new_cl = new_cl;
2064 	args.w.fn = tc_bind_class_walker;
2065 	q->ops->cl_ops->walk(q, &args.w);
2066 }
2067 
2068 #else
2069 
/* Stub used when CONFIG_NET_CLS is not set: intentionally empty. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
2074 
2075 #endif
2076 
2077 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2078 			 struct netlink_ext_ack *extack)
2079 {
2080 	struct net *net = sock_net(skb->sk);
2081 	struct tcmsg *tcm = nlmsg_data(n);
2082 	struct nlattr *tca[TCA_MAX + 1];
2083 	struct net_device *dev;
2084 	struct Qdisc *q = NULL;
2085 	const struct Qdisc_class_ops *cops;
2086 	unsigned long cl = 0;
2087 	unsigned long new_cl;
2088 	u32 portid;
2089 	u32 clid;
2090 	u32 qid;
2091 	int err;
2092 
2093 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2094 				     rtm_tca_policy, extack);
2095 	if (err < 0)
2096 		return err;
2097 
2098 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2099 	if (!dev)
2100 		return -ENODEV;
2101 
2102 	/*
2103 	   parent == TC_H_UNSPEC - unspecified parent.
2104 	   parent == TC_H_ROOT   - class is root, which has no parent.
2105 	   parent == X:0	 - parent is root class.
2106 	   parent == X:Y	 - parent is a node in hierarchy.
2107 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
2108 
2109 	   handle == 0:0	 - generate handle from kernel pool.
2110 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
2111 	   handle == X:Y	 - clear.
2112 	   handle == X:0	 - root class.
2113 	 */
2114 
2115 	/* Step 1. Determine qdisc handle X:0 */
2116 
2117 	portid = tcm->tcm_parent;
2118 	clid = tcm->tcm_handle;
2119 	qid = TC_H_MAJ(clid);
2120 
2121 	if (portid != TC_H_ROOT) {
2122 		u32 qid1 = TC_H_MAJ(portid);
2123 
2124 		if (qid && qid1) {
2125 			/* If both majors are known, they must be identical. */
2126 			if (qid != qid1)
2127 				return -EINVAL;
2128 		} else if (qid1) {
2129 			qid = qid1;
2130 		} else if (qid == 0)
2131 			qid = rtnl_dereference(dev->qdisc)->handle;
2132 
2133 		/* Now qid is genuine qdisc handle consistent
2134 		 * both with parent and child.
2135 		 *
2136 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2137 		 */
2138 		if (portid)
2139 			portid = TC_H_MAKE(qid, portid);
2140 	} else {
2141 		if (qid == 0)
2142 			qid = rtnl_dereference(dev->qdisc)->handle;
2143 	}
2144 
2145 	/* OK. Locate qdisc */
2146 	q = qdisc_lookup(dev, qid);
2147 	if (!q)
2148 		return -ENOENT;
2149 
2150 	/* An check that it supports classes */
2151 	cops = q->ops->cl_ops;
2152 	if (cops == NULL)
2153 		return -EINVAL;
2154 
2155 	/* Now try to get class */
2156 	if (clid == 0) {
2157 		if (portid == TC_H_ROOT)
2158 			clid = qid;
2159 	} else
2160 		clid = TC_H_MAKE(qid, clid);
2161 
2162 	if (clid)
2163 		cl = cops->find(q, clid);
2164 
2165 	if (cl == 0) {
2166 		err = -ENOENT;
2167 		if (n->nlmsg_type != RTM_NEWTCLASS ||
2168 		    !(n->nlmsg_flags & NLM_F_CREATE))
2169 			goto out;
2170 	} else {
2171 		switch (n->nlmsg_type) {
2172 		case RTM_NEWTCLASS:
2173 			err = -EEXIST;
2174 			if (n->nlmsg_flags & NLM_F_EXCL)
2175 				goto out;
2176 			break;
2177 		case RTM_DELTCLASS:
2178 			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2179 			/* Unbind the class with flilters with 0 */
2180 			tc_bind_tclass(q, portid, clid, 0);
2181 			goto out;
2182 		case RTM_GETTCLASS:
2183 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
2184 			goto out;
2185 		default:
2186 			err = -EINVAL;
2187 			goto out;
2188 		}
2189 	}
2190 
2191 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2192 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2193 		return -EOPNOTSUPP;
2194 	}
2195 
2196 	new_cl = cl;
2197 	err = -EOPNOTSUPP;
2198 	if (cops->change)
2199 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
2200 	if (err == 0) {
2201 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2202 		/* We just created a new class; need to do reverse binding. */
2203 		if (cl != new_cl)
2204 			tc_bind_tclass(q, portid, clid, new_cl);
2205 	}
2206 out:
2207 	return err;
2208 }
2209 
2210 struct qdisc_dump_args {
2211 	struct qdisc_walker	w;
2212 	struct sk_buff		*skb;
2213 	struct netlink_callback	*cb;
2214 };
2215 
2216 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2217 			    struct qdisc_walker *arg)
2218 {
2219 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2220 
2221 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2222 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2223 			      RTM_NEWTCLASS, NULL);
2224 }
2225 
2226 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2227 				struct tcmsg *tcm, struct netlink_callback *cb,
2228 				int *t_p, int s_t)
2229 {
2230 	struct qdisc_dump_args arg;
2231 
2232 	if (tc_qdisc_dump_ignore(q, false) ||
2233 	    *t_p < s_t || !q->ops->cl_ops ||
2234 	    (tcm->tcm_parent &&
2235 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2236 		(*t_p)++;
2237 		return 0;
2238 	}
2239 	if (*t_p > s_t)
2240 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2241 	arg.w.fn = qdisc_class_dump;
2242 	arg.skb = skb;
2243 	arg.cb = cb;
2244 	arg.w.stop  = 0;
2245 	arg.w.skip = cb->args[1];
2246 	arg.w.count = 0;
2247 	q->ops->cl_ops->walk(q, &arg.w);
2248 	cb->args[1] = arg.w.count;
2249 	if (arg.w.stop)
2250 		return -1;
2251 	(*t_p)++;
2252 	return 0;
2253 }
2254 
2255 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2256 			       struct tcmsg *tcm, struct netlink_callback *cb,
2257 			       int *t_p, int s_t, bool recur)
2258 {
2259 	struct Qdisc *q;
2260 	int b;
2261 
2262 	if (!root)
2263 		return 0;
2264 
2265 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2266 		return -1;
2267 
2268 	if (!qdisc_dev(root) || !recur)
2269 		return 0;
2270 
2271 	if (tcm->tcm_parent) {
2272 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2273 		if (q && q != root &&
2274 		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2275 			return -1;
2276 		return 0;
2277 	}
2278 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2279 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2280 			return -1;
2281 	}
2282 
2283 	return 0;
2284 }
2285 
2286 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2287 {
2288 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2289 	struct net *net = sock_net(skb->sk);
2290 	struct netdev_queue *dev_queue;
2291 	struct net_device *dev;
2292 	int t, s_t;
2293 
2294 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2295 		return 0;
2296 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2297 	if (!dev)
2298 		return 0;
2299 
2300 	s_t = cb->args[0];
2301 	t = 0;
2302 
2303 	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2304 				skb, tcm, cb, &t, s_t, true) < 0)
2305 		goto done;
2306 
2307 	dev_queue = dev_ingress_queue(dev);
2308 	if (dev_queue &&
2309 	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2310 				skb, tcm, cb, &t, s_t, false) < 0)
2311 		goto done;
2312 
2313 done:
2314 	cb->args[0] = t;
2315 
2316 	dev_put(dev);
2317 	return skb->len;
2318 }
2319 
2320 #ifdef CONFIG_PROC_FS
2321 static int psched_show(struct seq_file *seq, void *v)
2322 {
2323 	seq_printf(seq, "%08x %08x %08x %08x\n",
2324 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2325 		   1000000,
2326 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2327 
2328 	return 0;
2329 }
2330 
2331 static int __net_init psched_net_init(struct net *net)
2332 {
2333 	struct proc_dir_entry *e;
2334 
2335 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2336 	if (e == NULL)
2337 		return -ENOMEM;
2338 
2339 	return 0;
2340 }
2341 
2342 static void __net_exit psched_net_exit(struct net *net)
2343 {
2344 	remove_proc_entry("psched", net->proc_net);
2345 }
2346 #else
2347 static int __net_init psched_net_init(struct net *net)
2348 {
2349 	return 0;
2350 }
2351 
2352 static void __net_exit psched_net_exit(struct net *net)
2353 {
2354 }
2355 #endif
2356 
2357 static struct pernet_operations psched_net_ops = {
2358 	.init = psched_net_init,
2359 	.exit = psched_net_exit,
2360 };
2361 
/*
 * Static key, initially false, defined only for CONFIG_RETPOLINE builds.
 * NOTE(review): presumably consumed by the helpers in <net/tc_wrapper.h>
 * and set up by tc_wrapper_init() -- confirm against that header.
 */
#if IS_ENABLED(CONFIG_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif
2365 
2366 static int __init pktsched_init(void)
2367 {
2368 	int err;
2369 
2370 	err = register_pernet_subsys(&psched_net_ops);
2371 	if (err) {
2372 		pr_err("pktsched_init: "
2373 		       "cannot initialize per netns operations\n");
2374 		return err;
2375 	}
2376 
2377 	register_qdisc(&pfifo_fast_ops);
2378 	register_qdisc(&pfifo_qdisc_ops);
2379 	register_qdisc(&bfifo_qdisc_ops);
2380 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2381 	register_qdisc(&mq_qdisc_ops);
2382 	register_qdisc(&noqueue_qdisc_ops);
2383 
2384 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2385 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2386 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2387 		      0);
2388 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2389 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2390 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2391 		      0);
2392 
2393 	tc_wrapper_init();
2394 
2395 	return 0;
2396 }
2397 
2398 subsys_initcall(pktsched_init);
2399