/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some
   sanity checks and do the part of the work that is common to
   all qdiscs, and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns qdisc to initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
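
   For illustration only, here is a minimal sketch of how a qdisc module
   wires these routines into an ops table; the example_* names are
   hypothetical and not part of this file:

	static struct Qdisc_ops example_fifo_ops __read_mostly = {
		.id		= "example_fifo",
		.enqueue	= example_enqueue,	(0 on success, NET_XMIT_* on drop)
		.dequeue	= example_dequeue,	(skb to send, or NULL)
		.peek		= qdisc_peek_head,
		.init		= example_init,
		.reset		= qdisc_reset_queue,
		.owner		= THIS_MODULE,
	};

   The module then calls register_qdisc(&example_fifo_ops) from its
   module_init() and unregister_qdisc() from its module_exit().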
 */

/* Protects list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif

/* We know handle. Find qdisc among all qdiscs attached to the device
 * (root qdisc, all its children, children of children etc.)
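 * A handle is a 32-bit major:minor pair; TC_H_MAJ() and TC_H_MIN()
 * extract the halves, so e.g. the handle "1:" from tc is 0x00010000 here.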
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	return cops->leaf(p, cl);
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2, in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by checking whether the rate
 * table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value. The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell. If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
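 *
 * As a worked example (illustrative numbers only): with mpu = 0 and
 * cell_log = 3, low = 0 and high = 48, so cell_low = 0 and cell_high = 5.
 * On an ATM-aligned table, entries 0..5 (packet sizes below 48 bytes)
 * all cost a single 48-byte cell and therefore hold the same value,
 * which is what the comparison below detects.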
385 */ 386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) 387 { 388 int low = roundup(r->mpu, 48); 389 int high = roundup(low+1, 48); 390 int cell_low = low >> r->cell_log; 391 int cell_high = (high >> r->cell_log) - 1; 392 393 /* rtab is too inaccurate at rates > 100Mbit/s */ 394 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { 395 pr_debug("TC linklayer: Giving up ATM detection\n"); 396 return TC_LINKLAYER_ETHERNET; 397 } 398 399 if ((cell_high > cell_low) && (cell_high < 256) 400 && (rtab[cell_low] == rtab[cell_high])) { 401 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", 402 cell_low, cell_high, rtab[cell_high]); 403 return TC_LINKLAYER_ATM; 404 } 405 return TC_LINKLAYER_ETHERNET; 406 } 407 408 static struct qdisc_rate_table *qdisc_rtab_list; 409 410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, 411 struct nlattr *tab, 412 struct netlink_ext_ack *extack) 413 { 414 struct qdisc_rate_table *rtab; 415 416 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || 417 nla_len(tab) != TC_RTAB_SIZE) { 418 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); 419 return NULL; 420 } 421 422 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { 423 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && 424 !memcmp(&rtab->data, nla_data(tab), 1024)) { 425 rtab->refcnt++; 426 return rtab; 427 } 428 } 429 430 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); 431 if (rtab) { 432 rtab->rate = *r; 433 rtab->refcnt = 1; 434 memcpy(rtab->data, nla_data(tab), 1024); 435 if (r->linklayer == TC_LINKLAYER_UNAWARE) 436 r->linklayer = __detect_linklayer(r, rtab->data); 437 rtab->next = qdisc_rtab_list; 438 qdisc_rtab_list = rtab; 439 } else { 440 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); 441 } 442 return rtab; 443 } 444 EXPORT_SYMBOL(qdisc_get_rtab); 445 446 void qdisc_put_rtab(struct qdisc_rate_table *tab) 447 { 448 struct qdisc_rate_table *rtab, **rtabp; 449 450 if (!tab || --tab->refcnt) 451 return; 452 453 for (rtabp = &qdisc_rtab_list; 454 (rtab = *rtabp) != NULL; 455 rtabp = &rtab->next) { 456 if (rtab == tab) { 457 *rtabp = rtab->next; 458 kfree(rtab); 459 return; 460 } 461 } 462 } 463 EXPORT_SYMBOL(qdisc_put_rtab); 464 465 static LIST_HEAD(qdisc_stab_list); 466 467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { 468 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, 469 [TCA_STAB_DATA] = { .type = NLA_BINARY }, 470 }; 471 472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, 473 struct netlink_ext_ack *extack) 474 { 475 struct nlattr *tb[TCA_STAB_MAX + 1]; 476 struct qdisc_size_table *stab; 477 struct tc_sizespec *s; 478 unsigned int tsize = 0; 479 u16 *tab = NULL; 480 int err; 481 482 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack); 483 if (err < 0) 484 return ERR_PTR(err); 485 if (!tb[TCA_STAB_BASE]) { 486 NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); 487 return ERR_PTR(-EINVAL); 488 } 489 490 s = nla_data(tb[TCA_STAB_BASE]); 491 492 if (s->tsize > 0) { 493 if (!tb[TCA_STAB_DATA]) { 494 NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); 495 return ERR_PTR(-EINVAL); 496 } 497 tab = nla_data(tb[TCA_STAB_DATA]); 498 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); 499 } 500 501 if (tsize != s->tsize || (!tab && tsize > 0)) { 502 NL_SET_ERR_MSG(extack, "Invalid size of size table"); 503 return ERR_PTR(-EINVAL); 504 } 505 506 list_for_each_entry(stab, &qdisc_stab_list, list) { 507 if (memcmp(&stab->szopts, s, sizeof(*s))) 508 
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu(&tab->rcu, stab_kfree_rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void
qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from space managed by the kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/*
		 * Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);

int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ?
				  new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ?
				  NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_put(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ?
				     : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
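				 * (request_module("sch_%s") works because
				 * qdisc modules are named sch_<id>, e.g.
				 * kind "fq" loads sch_fq.ko.)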
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
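 *
 * RTM_DELQDISC grafts a NULL qdisc over the old one (destroying it),
 * while RTM_GETQDISC only looks the qdisc up and echoes it back via
 * qdisc_notify().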
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this.
	 */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and we have a
				 * choice: either to change it or to
				 * create/graft a new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if both the CREATE and REPLACE flags
				 * are set.
				 *
				 * 2. If EXCL is set, the requestor meant
				 * that the qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is a sort of hole in the API; we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft if the
				 * user gave KIND, which does not match the
				 * existing one.
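				 *
				 * For reference (observed iproute2
				 * behavior, not part of this API):
				 *   tc qdisc add     -> CREATE | EXCL
				 *   tc qdisc replace -> CREATE | REPLACE
				 *   tc qdisc change  -> (none of the three)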
1587 */ 1588 if ((n->nlmsg_flags & NLM_F_CREATE) && 1589 (n->nlmsg_flags & NLM_F_REPLACE) && 1590 ((n->nlmsg_flags & NLM_F_EXCL) || 1591 (tca[TCA_KIND] && 1592 nla_strcmp(tca[TCA_KIND], q->ops->id)))) 1593 goto create_n_graft; 1594 } 1595 } 1596 } else { 1597 if (!tcm->tcm_handle) { 1598 NL_SET_ERR_MSG(extack, "Handle cannot be zero"); 1599 return -EINVAL; 1600 } 1601 q = qdisc_lookup(dev, tcm->tcm_handle); 1602 } 1603 1604 /* Change qdisc parameters */ 1605 if (!q) { 1606 NL_SET_ERR_MSG(extack, "Specified qdisc not found"); 1607 return -ENOENT; 1608 } 1609 if (n->nlmsg_flags & NLM_F_EXCL) { 1610 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); 1611 return -EEXIST; 1612 } 1613 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { 1614 NL_SET_ERR_MSG(extack, "Invalid qdisc name"); 1615 return -EINVAL; 1616 } 1617 err = qdisc_change(q, tca, extack); 1618 if (err == 0) 1619 qdisc_notify(net, skb, n, clid, NULL, q); 1620 return err; 1621 1622 create_n_graft: 1623 if (!(n->nlmsg_flags & NLM_F_CREATE)) { 1624 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); 1625 return -ENOENT; 1626 } 1627 if (clid == TC_H_INGRESS) { 1628 if (dev_ingress_queue(dev)) { 1629 q = qdisc_create(dev, dev_ingress_queue(dev), p, 1630 tcm->tcm_parent, tcm->tcm_parent, 1631 tca, &err, extack); 1632 } else { 1633 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); 1634 err = -ENOENT; 1635 } 1636 } else { 1637 struct netdev_queue *dev_queue; 1638 1639 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) 1640 dev_queue = p->ops->cl_ops->select_queue(p, tcm); 1641 else if (p) 1642 dev_queue = p->dev_queue; 1643 else 1644 dev_queue = netdev_get_tx_queue(dev, 0); 1645 1646 q = qdisc_create(dev, dev_queue, p, 1647 tcm->tcm_parent, tcm->tcm_handle, 1648 tca, &err, extack); 1649 } 1650 if (q == NULL) { 1651 if (err == -EAGAIN) 1652 goto replay; 1653 return err; 1654 } 1655 1656 graft: 1657 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); 1658 if (err) { 1659 if (q) 1660 qdisc_put(q); 1661 return err; 1662 } 1663 1664 return 0; 1665 } 1666 1667 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, 1668 struct netlink_callback *cb, 1669 int *q_idx_p, int s_q_idx, bool recur, 1670 bool dump_invisible) 1671 { 1672 int ret = 0, q_idx = *q_idx_p; 1673 struct Qdisc *q; 1674 int b; 1675 1676 if (!root) 1677 return 0; 1678 1679 q = root; 1680 if (q_idx < s_q_idx) { 1681 q_idx++; 1682 } else { 1683 if (!tc_qdisc_dump_ignore(q, dump_invisible) && 1684 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1685 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1686 RTM_NEWQDISC) <= 0) 1687 goto done; 1688 q_idx++; 1689 } 1690 1691 /* If dumping singletons, there is no qdisc_dev(root) and the singleton 1692 * itself has already been dumped. 
1693 * 1694 * If we've already dumped the top-level (ingress) qdisc above and the global 1695 * qdisc hashtable, we don't want to hit it again 1696 */ 1697 if (!qdisc_dev(root) || !recur) 1698 goto out; 1699 1700 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { 1701 if (q_idx < s_q_idx) { 1702 q_idx++; 1703 continue; 1704 } 1705 if (!tc_qdisc_dump_ignore(q, dump_invisible) && 1706 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, 1707 cb->nlh->nlmsg_seq, NLM_F_MULTI, 1708 RTM_NEWQDISC) <= 0) 1709 goto done; 1710 q_idx++; 1711 } 1712 1713 out: 1714 *q_idx_p = q_idx; 1715 return ret; 1716 done: 1717 ret = -1; 1718 goto out; 1719 } 1720 1721 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) 1722 { 1723 struct net *net = sock_net(skb->sk); 1724 int idx, q_idx; 1725 int s_idx, s_q_idx; 1726 struct net_device *dev; 1727 const struct nlmsghdr *nlh = cb->nlh; 1728 struct nlattr *tca[TCA_MAX + 1]; 1729 int err; 1730 1731 s_idx = cb->args[0]; 1732 s_q_idx = q_idx = cb->args[1]; 1733 1734 idx = 0; 1735 ASSERT_RTNL(); 1736 1737 err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, 1738 rtm_tca_policy, cb->extack); 1739 if (err < 0) 1740 return err; 1741 1742 for_each_netdev(net, dev) { 1743 struct netdev_queue *dev_queue; 1744 1745 if (idx < s_idx) 1746 goto cont; 1747 if (idx > s_idx) 1748 s_q_idx = 0; 1749 q_idx = 0; 1750 1751 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, 1752 true, tca[TCA_DUMP_INVISIBLE]) < 0) 1753 goto done; 1754 1755 dev_queue = dev_ingress_queue(dev); 1756 if (dev_queue && 1757 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, 1758 &q_idx, s_q_idx, false, 1759 tca[TCA_DUMP_INVISIBLE]) < 0) 1760 goto done; 1761 1762 cont: 1763 idx++; 1764 } 1765 1766 done: 1767 cb->args[0] = idx; 1768 cb->args[1] = q_idx; 1769 1770 return skb->len; 1771 } 1772 1773 1774 1775 /************************************************ 1776 * Traffic classes manipulation. 
 *
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ?
		     NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	u32 classid;
	unsigned long cl;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl);
		sch_tree_unlock(q);
	}
	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical.
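			 * E.g. parent 1:2 with handle 1:3 is consistent
			 * (both majors are 0x0001), while handle 2:3 under
			 * parent 1:2 is rejected below.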
			 */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent
		 * with both parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class from its filters with 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; we need to do reverse binding.
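		 * i.e. filters that were already bound to this classid are
		 * re-pointed at the new internal class (see tc_bind_tclass()).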
		 */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);