// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tc_wrapper.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform sanity
   checks and the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty, it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
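
/* To make the contract above concrete, here is a minimal sketch (kept under
 * #if 0, purely illustrative and not part of this file) of how a simple
 * queue-only discipline would wire those routines into Qdisc_ops; the
 * "example_*" names are hypothetical:
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free)
{
	if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
		return qdisc_enqueue_tail(skb, sch);	/* NET_XMIT_SUCCESS */
	return qdisc_drop(skb, sch, to_free);		/* NET_XMIT_DROP */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);	/* NULL: nothing to send right now */
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.reset		= qdisc_reset_queue,
	.owner		= THIS_MODULE,
};
#endif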

/* Protects list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

void unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);

	WARN(err, "unregister qdisc(%s) failed\n", qops->id);
}
EXPORT_SYMBOL(unregister_qdisc);
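
/* A hedged sketch of the normal caller pattern for the two functions above:
 * a scheduler module registers its ops on load and unregisters them on
 * unload ("example_*" names are hypothetical, continuing the sketch earlier
 * in this file):
 */
#if 0
static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif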

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strscpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
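
/* qdisc_get_default()/qdisc_set_default() back the net.core.default_qdisc
 * sysctl. A minimal sketch of a proc handler wired to them, assuming the
 * usual proc_dostring() plumbing (details differ from the real handler in
 * net/core/sysctl_net_core.c; the "example_" name is hypothetical):
 */
#if 0
static int example_default_qdisc(struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	char id[IFNAMSIZ];
	struct ctl_table tbl = { .data = id, .maxlen = IFNAMSIZ };
	int ret;

	qdisc_get_default(id, IFNAMSIZ);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = qdisc_set_default(id);	/* may load sch_<id>.ko */
	return ret;
}
#endif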

/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
				   lockdep_rtnl_is_held()) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
			handle);
out:
	return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
					  handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid,
				struct netlink_ext_ack *extack)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL) {
		NL_SET_ERR_MSG(extack, "Parent qdisc is not classful");
		return ERR_PTR(-EOPNOTSUPP);
	}
	cl = cops->find(p, classid);

	if (cl == 0) {
		NL_SET_ERR_MSG(extack, "Specified class not found");
		return ERR_PTR(-ENOENT);
	}
	return cops->leaf(p, cl);
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utilities, we detect the linklayer setting by checking whether the
 * rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value. The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell. If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
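
/* Worked example of the detection arithmetic above (hypothetical numbers):
 * with mpu = 0 and cell_log = 3, low = roundup(0, 48) = 0 and
 * high = roundup(1, 48) = 48, so cell_low = 0 >> 3 = 0 and
 * cell_high = (48 >> 3) - 1 = 5. An ATM-aligned table repeats the same
 * value within each 48-byte cell, so rtab[0] == rtab[5] and we report
 * TC_LINKLAYER_ATM; an unmodified (Ethernet) table grows per byte slot,
 * so the two entries differ.
 */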

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
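
/* A hedged sketch of how a rate-table based qdisc (tbf-like code, say)
 * would pair the two helpers above; the function and attribute names are
 * illustrative:
 */
#if 0
static int example_change(struct Qdisc *sch, struct tc_ratespec *r,
			  struct nlattr *rtab_attr,
			  struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(r, rtab_attr, extack);	/* takes a reference */
	if (!rtab)
		return -EINVAL;
	/* ... stash rtab and use rtab->data[] for xmit-time lookups ... */
	qdisc_put_rtab(rtab);	/* release on reconfigure/destroy */
	return 0;
}
#endif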

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
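
/* Worked example for the lookup above (hypothetical size table): with
 * overhead = 24, cell_align = 0, cell_log = 6, size_log = 0 and
 * tsize = 512, a 40-byte skb gives pkt_len = 40 + 24 = 64, then
 * slot = 64 >> 6 = 1, so pkt_len becomes stab->data[1]. Slots past the
 * end of the table reuse it modulo tsize, adding data[tsize - 1] once
 * per full wrap, which is what the multiply-and-remainder branch computes.
 */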

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	bool deactivated;

	rcu_read_lock();
	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
			       &qdisc_root_sleeping(wd->qdisc)->state);
	rcu_read_unlock();
	if (deactivated)
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		u64 softexpires;

		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
		 */
		if (softexpires - expires <= delta_ns)
			return;
	}

	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
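
/* A hedged sketch of the usual watchdog life cycle in a shaping qdisc:
 * armed from ->dequeue() when the next packet is not yet due, cancelled
 * from ->reset()/->destroy(). Everything except the qdisc_watchdog_* API
 * is illustrative:
 */
#if 0
struct example_shaper_data {
	struct qdisc_watchdog	watchdog;	/* init'd in ->init() */
	u64			next_send_time;
};

static struct sk_buff *example_dequeue_shaped(struct Qdisc *sch)
{
	struct example_shaper_data *q = qdisc_priv(sch);
	u64 now = ktime_get_ns();

	if (q->next_send_time > now) {
		/* Reschedule ourselves for when credit becomes available. */
		qdisc_watchdog_schedule_ns(&q->watchdog, q->next_send_time);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}
#endif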

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize = size;
	clhash->hashmask = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
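
/* A hedged sketch of how a classful qdisc consumes this hash API: each
 * class embeds a Qdisc_class_common, is inserted under the tree lock, and
 * the table is grown opportunistically afterwards ("example_*" names are
 * hypothetical):
 */
#if 0
struct example_classful_data {
	struct Qdisc_class_hash clhash;		/* qdisc_class_hash_init() */
};

struct example_class {
	struct Qdisc_class_common common;	/* classid + hnode */
	/* ... scheduler-specific per-class state ... */
};

static void example_add_class(struct Qdisc *sch, struct example_class *cl,
			      u32 classid)
{
	struct example_classful_data *q = qdisc_priv(sch);

	cl->common.classid = classid;
	sch_tree_lock(sch);
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);
	qdisc_class_hash_grow(sch, &q->clhash);	/* rehash if load > 0.75 */
}
#endif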

/* Allocate a unique handle from the space managed by the kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (parentid == TC_H_ROOT)
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty. */
		notify = !sch->q.qlen;
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			/* Note that qlen_notify must be idempotent as it may get called
			 * multiple times.
			 */
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
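
/* A hedged sketch of the typical caller of the helper above: a qdisc that
 * drops packets outside of enqueue/dequeue (e.g. when its limit shrinks on
 * ->change()) must tell its ancestors how many packets and bytes went away
 * ("example_" name is hypothetical):
 */
#if 0
static void example_shrink_queue(struct Qdisc *sch, u32 new_limit)
{
	unsigned int dropped = 0, dropped_len = 0;

	while (sch->q.qlen > new_limit) {
		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);

		dropped++;
		dropped_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
	}
	qdisc_tree_reduce_backlog(sch, dropped, dropped_len);
}
#endif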

int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);
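
/* A hedged sketch of an offloadable qdisc's dump path using the helper
 * above; the struct layout and the TC_SETUP_QDISC_EXAMPLE value are
 * hypothetical stand-ins for a real qdisc's offload types:
 */
#if 0
struct tc_example_qopt_offload {	/* illustrative only */
	u32 handle;
	u32 parent;
};

static int example_dump_offload(struct Qdisc *sch)
{
	struct tc_example_qopt_offload hw = {
		.handle = sch->handle,
		.parent = sch->parent,
	};

	/* Sets/clears TCQ_F_OFFLOADED and swallows -EOPNOTSUPP. */
	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_EXAMPLE, &hw);
}
#endif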

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

void qdisc_offload_query_caps(struct net_device *dev,
			      enum tc_setup_type type,
			      void *caps, size_t caps_len)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_query_caps_base base = {
		.type = type,
		.caps = caps,
	};

	memset(caps, 0, caps_len);

	if (ops->ndo_setup_tc)
		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
}
EXPORT_SYMBOL(qdisc_offload_query_caps);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event,
			 struct netlink_ext_ack *extack)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new,
			struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC, extack) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new,
			       struct netlink_ext_ack *extack)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new, extack);

	if (old)
		qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
	sch->flags &= ~TCQ_F_NOLOCK;
	if (!(sch->flags & TCQ_F_CPUSTATS))
		return;

	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	sch->cpu_bstats = NULL;
	sch->cpu_qstats = NULL;
	sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach && !ingress)
			goto skip;

		if (!ingress) {
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new, extack);

			if (new && new->ops->attach)
				new->ops->attach(new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		if (new &&
		    !(parent->flags & TCQ_F_MQROOT) &&
		    rcu_access_pointer(new->stab)) {
			NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
			return -EINVAL;
		}
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new, extack);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}
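
/* From userspace these block indexes correspond to the tc(8) shared-block
 * syntax, e.g. (example command, assuming an ingress_block-capable kernel
 * and iproute2):
 *
 *	tc qdisc add dev eth0 ingress_block 22 clsact
 *
 * which arrives here as TCA_INGRESS_BLOCK = 22 and lets several devices
 * share a single filter block.
 */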

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole, which allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out3;
		}
		rcu_assign_pointer(sch->stab, stab);
	}

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out4;
	}

	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out4:
	/* Even if ops->init() failed, we call ops->destroy()
	 * like qdisc_create_dflt().
	 */
	if (ops->destroy)
		ops->destroy(sch);
	qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
	lockdep_unregister_key(&sch->root_lock_key);
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid, extack);
			} else if (dev_ingress_queue(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}
		if (IS_ERR(q))
			return PTR_ERR(q);

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q, NULL);
	}
	return 0;
}

static bool req_create_or_replace(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_REPLACE);
}

static bool req_create_exclusive(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_EXCL);
}

static bool req_change(struct nlmsghdr *n)
{
	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
		!(n->nlmsg_flags & NLM_F_REPLACE) &&
		!(n->nlmsg_flags & NLM_F_EXCL));
}
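
/* These three predicates map onto the tc(8) verbs roughly as follows
 * (conventional iproute2 behaviour):
 *
 *	tc qdisc add     -> NLM_F_CREATE | NLM_F_EXCL
 *	tc qdisc replace -> NLM_F_CREATE | NLM_F_REPLACE
 *	tc qdisc change  -> none of the three flags
 */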
1572
1573 /*
1574 * Create/change qdisc.
1575 */
tc_modify_qdisc(struct sk_buff * skb,struct nlmsghdr * n,struct netlink_ext_ack * extack)1576 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1577 struct netlink_ext_ack *extack)
1578 {
1579 struct net *net = sock_net(skb->sk);
1580 struct tcmsg *tcm;
1581 struct nlattr *tca[TCA_MAX + 1];
1582 struct net_device *dev;
1583 u32 clid;
1584 struct Qdisc *q, *p;
1585 int err;
1586
1587 replay:
1588 /* Reinit, just in case something touches this. */
1589 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1590 rtm_tca_policy, extack);
1591 if (err < 0)
1592 return err;
1593
1594 tcm = nlmsg_data(n);
1595 clid = tcm->tcm_parent;
1596 q = p = NULL;
1597
1598 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1599 if (!dev)
1600 return -ENODEV;
1601
1602
1603 if (clid) {
1604 if (clid != TC_H_ROOT) {
1605 if (clid != TC_H_INGRESS) {
1606 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1607 if (!p) {
1608 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1609 return -ENOENT;
1610 }
1611 q = qdisc_leaf(p, clid, extack);
1612 if (IS_ERR(q))
1613 return PTR_ERR(q);
1614 } else if (dev_ingress_queue_create(dev)) {
1615 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1616 }
1617 } else {
1618 q = rtnl_dereference(dev->qdisc);
1619 }
1620
1621 /* It may be default qdisc, ignore it */
1622 if (q && q->handle == 0)
1623 q = NULL;
1624
1625 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1626 if (tcm->tcm_handle) {
1627 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1628 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1629 return -EEXIST;
1630 }
1631 if (TC_H_MIN(tcm->tcm_handle)) {
1632 NL_SET_ERR_MSG(extack, "Invalid minor handle");
1633 return -EINVAL;
1634 }
1635 q = qdisc_lookup(dev, tcm->tcm_handle);
1636 if (!q)
1637 goto create_n_graft;
1638 if (q->parent != tcm->tcm_parent) {
1639 NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent");
1640 return -EINVAL;
1641 }
1642 if (n->nlmsg_flags & NLM_F_EXCL) {
1643 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1644 return -EEXIST;
1645 }
1646 if (tca[TCA_KIND] &&
1647 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1648 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1649 return -EINVAL;
1650 }
1651 if (q->flags & TCQ_F_INGRESS) {
1652 NL_SET_ERR_MSG(extack,
1653 "Cannot regraft ingress or clsact Qdiscs");
1654 return -EINVAL;
1655 }
1656 if (q == p ||
1657 (p && check_loop(q, p, 0))) {
1658 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1659 return -ELOOP;
1660 }
1661 if (clid == TC_H_INGRESS) {
1662 NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1663 return -EINVAL;
1664 }
1665 qdisc_refcount_inc(q);
1666 goto graft;
1667 } else {
1668 if (!q)
1669 goto create_n_graft;
1670
1671 /* This magic test requires explanation.
1672 *
1673 * We know, that some child q is already
1674 * attached to this parent and have choice:
1675 * 1) change it or 2) create/graft new one.
1676 * If the requested qdisc kind is different
1677 * than the existing one, then we choose graft.
1678 * If they are the same then this is "change"
1679 * operation - just let it fallthrough..
1680 *
1681 * 1. We are allowed to create/graft only
1682 * if the request is explicitly stating
1683 * "please create if it doesn't exist".
1684 *
1685 * 2. If the request is to exclusive create
1686 * then the qdisc tcm_handle is not expected
1687 * to exist, so that we choose create/graft too.
1688 *
1689 * 3. The last case is when no flags are set.
1690 * This will happen when for example tc
1691 * utility issues a "change" command.
1692 * Alas, it is sort of hole in API, we
1693 * cannot decide what to do unambiguously.
1694 * For now we select create/graft.
1695 */
1696 if (tca[TCA_KIND] &&
1697 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1698 if (req_create_or_replace(n) ||
1699 req_create_exclusive(n))
1700 goto create_n_graft;
1701 else if (req_change(n))
1702 goto create_n_graft2;
1703 }
1704 }
1705 }
1706 } else {
1707 if (!tcm->tcm_handle) {
1708 NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1709 return -EINVAL;
1710 }
1711 q = qdisc_lookup(dev, tcm->tcm_handle);
1712 }
1713
1714 /* Change qdisc parameters */
1715 if (!q) {
1716 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1717 return -ENOENT;
1718 }
1719 if (n->nlmsg_flags & NLM_F_EXCL) {
1720 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1721 return -EEXIST;
1722 }
1723 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1724 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1725 return -EINVAL;
1726 }
1727 err = qdisc_change(q, tca, extack);
1728 if (err == 0)
1729 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1730 return err;
1731
1732 create_n_graft:
1733 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1734 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1735 return -ENOENT;
1736 }
1737 create_n_graft2:
1738 if (clid == TC_H_INGRESS) {
1739 if (dev_ingress_queue(dev)) {
1740 q = qdisc_create(dev, dev_ingress_queue(dev),
1741 tcm->tcm_parent, tcm->tcm_parent,
1742 tca, &err, extack);
1743 } else {
1744 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1745 err = -ENOENT;
1746 }
1747 } else {
1748 struct netdev_queue *dev_queue;
1749
1750 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1751 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1752 else if (p)
1753 dev_queue = p->dev_queue;
1754 else
1755 dev_queue = netdev_get_tx_queue(dev, 0);
1756
1757 q = qdisc_create(dev, dev_queue,
1758 tcm->tcm_parent, tcm->tcm_handle,
1759 tca, &err, extack);
1760 }
1761 if (q == NULL) {
1762 if (err == -EAGAIN)
1763 goto replay;
1764 return err;
1765 }
1766
1767 graft:
1768 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1769 if (err) {
1770 if (q)
1771 qdisc_put(q);
1772 return err;
1773 }
1774
1775 return 0;
1776 }
1777
tc_dump_qdisc_root(struct Qdisc * root,struct sk_buff * skb,struct netlink_callback * cb,int * q_idx_p,int s_q_idx,bool recur,bool dump_invisible)1778 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1779 struct netlink_callback *cb,
1780 int *q_idx_p, int s_q_idx, bool recur,
1781 bool dump_invisible)
1782 {
1783 int ret = 0, q_idx = *q_idx_p;
1784 struct Qdisc *q;
1785 int b;
1786
1787 if (!root)
1788 return 0;
1789
1790 q = root;
1791 if (q_idx < s_q_idx) {
1792 q_idx++;
1793 } else {
1794 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1795 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1796 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1797 RTM_NEWQDISC, NULL) <= 0)
1798 goto done;
1799 q_idx++;
1800 }
1801
1802 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1803 * itself has already been dumped.
1804 *
1805 * If we've already dumped the top-level (ingress) qdisc above and the global
1806 * qdisc hashtable, we don't want to hit it again
1807 */
1808 if (!qdisc_dev(root) || !recur)
1809 goto out;
1810
1811 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1812 if (q_idx < s_q_idx) {
1813 q_idx++;
1814 continue;
1815 }
1816 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1817 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1818 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1819 RTM_NEWQDISC, NULL) <= 0)
1820 goto done;
1821 q_idx++;
1822 }
1823
1824 out:
1825 *q_idx_p = q_idx;
1826 return ret;
1827 done:
1828 ret = -1;
1829 goto out;
1830 }
1831
tc_dump_qdisc(struct sk_buff * skb,struct netlink_callback * cb)1832 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1833 {
1834 struct net *net = sock_net(skb->sk);
1835 int idx, q_idx;
1836 int s_idx, s_q_idx;
1837 struct net_device *dev;
1838 const struct nlmsghdr *nlh = cb->nlh;
1839 struct nlattr *tca[TCA_MAX + 1];
1840 int err;
1841
1842 s_idx = cb->args[0];
1843 s_q_idx = q_idx = cb->args[1];
1844
1845 idx = 0;
1846 ASSERT_RTNL();
1847
1848 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1849 rtm_tca_policy, cb->extack);
1850 if (err < 0)
1851 return err;
1852
1853 for_each_netdev(net, dev) {
1854 struct netdev_queue *dev_queue;
1855
1856 if (idx < s_idx)
1857 goto cont;
1858 if (idx > s_idx)
1859 s_q_idx = 0;
1860 q_idx = 0;
1861
1862 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1863 skb, cb, &q_idx, s_q_idx,
1864 true, tca[TCA_DUMP_INVISIBLE]) < 0)
1865 goto done;
1866
1867 dev_queue = dev_ingress_queue(dev);
1868 if (dev_queue &&
1869 tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1870 skb, cb, &q_idx, s_q_idx, false,
1871 tca[TCA_DUMP_INVISIBLE]) < 0)
1872 goto done;
1873
1874 cont:
1875 idx++;
1876 }
1877
1878 done:
1879 cb->args[0] = idx;
1880 cb->args[1] = q_idx;
1881
1882 return skb->len;
1883 }
1884
1885
1886
1887 /************************************************
1888 * Traffic classes manipulation. *
1889 ************************************************/
1890
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl, u32 portid, u32 seq, u16 flags,
			  int event, struct netlink_ext_ack *extack)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
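
/* The message built above, as seen by userspace (a descriptive note):
 * struct nlmsghdr, then struct tcmsg identifying device, parent and handle,
 * then TCA_KIND (the qdisc kind string), the class-specific attributes
 * emitted by cl_ops->dump(), the TCA_STATS2/TCA_STATS/TCA_XSTATS blocks
 * emitted through gnet_dump, and optionally TCA_EXT_WARN_MSG.
 */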

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event, struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl,
			     struct netlink_ext_ack *extack)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

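	/* Serialize the class before deleting it: once cops->delete() has
	 * run, the class is gone and can no longer be dumped.
	 */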
	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS, extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl, extack);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	return err;
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (n && tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
};

static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tc_bind_class_args args = {};

	if (!cops->tcf_block)
		return;
	args.portid = portid;
	args.clid = clid;
	args.new_cl = new_cl;
	args.w.fn = tc_bind_class_walker;
	q->ops->cl_ops->walk(q, &args.w);
}
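
/* In short (a descriptive note): tc_bind_tclass() walks every class of q;
 * for each class, tc_bind_class_walker() walks every filter chain and proto
 * on that class's tcf_block, and tcf_node_bind() asks each filter to rebind
 * results that point at 'clid' to the class cookie 'new_cl'. On class
 * deletion new_cl is 0, detaching the filters from the dead class; on class
 * creation it is the freshly created class, completing the reverse binding.
 */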

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - both major and minor are given explicitly.
	   handle == X:0	 - root class.
	 */
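
	/* Handle arithmetic, for reference (values follow from the TC_H_*
	 * macros in <linux/pkt_sched.h>; the concrete numbers are just an
	 * example): the tc notation "1:10" is major 1, minor 10, i.e.
	 *
	 *	TC_H_MAKE(1 << 16, 10) == 0x0001000a
	 *	TC_H_MAJ(0x0001000a)   == 0x00010000
	 *	TC_H_MIN(0x0001000a)   == 0x0000000a
	 *
	 * so "X:0" above means TC_H_MIN() == 0 and "0:Y" means TC_H_MAJ() == 0.
	 */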

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the deleted class from its filters by
			 * rebinding them to class 0.
			 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Prevent creation of traffic classes with classid TC_H_ROOT */
	if (clid == TC_H_ROOT) {
		NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT");
		return -EINVAL;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
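
/* For orientation (an illustrative mapping, not upstream documentation):
 * a command such as
 *
 *	tc class add dev eth0 parent 1: classid 1:10 htb rate 1mbit
 *
 * reaches this handler as an RTM_NEWTCLASS message with NLM_F_CREATE set,
 * tcm_parent == 0x00010000 ("1:"), tcm_handle == 0x0001000a ("1:10"), and
 * the htb parameters carried in tca[].
 */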

struct qdisc_dump_args {
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS, NULL);
}
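
/* Dump resumption, in brief (a descriptive note): cb->args[0] holds the
 * index of the qdisc at which the previous dump stopped and cb->args[1]
 * the number of classes already dumped from it. tc_dump_tclass_qdisc()
 * below skips whole qdiscs while *t_p < s_t, clears the per-qdisc state
 * once it moves past the interrupted qdisc, and feeds cb->args[1] to the
 * class walker as the number of classes to skip.
 */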

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
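
/* Example /proc/net/psched output, assuming PSCHED_SHIFT == 6 and
 * high-resolution timers (hrtimer_resolution == 1 ns):
 *
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. nanoseconds per psched microsecond, nanoseconds per psched tick,
 * the historical 1000000 constant, and the clock resolution in Hz.
 */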

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

#if IS_ENABLED(CONFIG_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	tc_wrapper_init();

	return 0;
}

subsys_initcall(pktsched_init);