1 /* 2 * net/sched/sch_api.c Packet scheduler API. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 10 * 11 * Fixes: 12 * 13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. 14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support 15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support 16 */ 17 18 #include <linux/module.h> 19 #include <linux/types.h> 20 #include <linux/kernel.h> 21 #include <linux/string.h> 22 #include <linux/errno.h> 23 #include <linux/skbuff.h> 24 #include <linux/init.h> 25 #include <linux/proc_fs.h> 26 #include <linux/seq_file.h> 27 #include <linux/kmod.h> 28 #include <linux/list.h> 29 #include <linux/hrtimer.h> 30 31 #include <net/net_namespace.h> 32 #include <net/sock.h> 33 #include <net/netlink.h> 34 #include <net/pkt_sched.h> 35 36 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, 37 struct Qdisc *old, struct Qdisc *new); 38 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, 39 struct Qdisc *q, unsigned long cl, int event); 40 41 /* 42 43 Short review. 44 ------------- 45 46 This file consists of two interrelated parts: 47 48 1. queueing disciplines manager frontend. 49 2. traffic classes manager frontend. 50 51 Generally, queueing discipline ("qdisc") is a black box, 52 which is able to enqueue packets and to dequeue them (when 53 device is ready to send something) in order and at times 54 determined by algorithm hidden in it. 55 56 qdisc's are divided to two categories: 57 - "queues", which have no internal structure visible from outside. 58 - "schedulers", which split all the packets to "traffic classes", 59 using "packet classifiers" (look at cls_api.c) 60 61 In turn, classes may have child qdiscs (as rule, queues) 62 attached to them etc. etc. etc. 63 64 The goal of the routines in this file is to translate 65 information supplied by user in the form of handles 66 to more intelligible for kernel form, to make some sanity 67 checks and part of work, which is common to all qdiscs 68 and to provide rtnetlink notifications. 69 70 All real intelligent work is done inside qdisc modules. 71 72 73 74 Every discipline has two major routines: enqueue and dequeue. 75 76 ---dequeue 77 78 dequeue usually returns a skb to send. It is allowed to return NULL, 79 but it does not mean that queue is empty, it just means that 80 discipline does not want to send anything this time. 81 Queue is really empty if q->q.qlen == 0. 82 For complicated disciplines with multiple queues q->q is not 83 real packet queue, but however q->q.qlen must be valid. 84 85 ---enqueue 86 87 enqueue returns 0, if packet was enqueued successfully. 88 If packet (this one or another one) was dropped, it returns 89 not zero error code. 90 NET_XMIT_DROP - this packet dropped 91 Expected action: do not backoff, but wait until queue will clear. 92 NET_XMIT_CN - probably this packet enqueued, but another one dropped. 93 Expected action: backoff or ignore 94 NET_XMIT_POLICED - dropped by police. 95 Expected action: backoff or error to real-time apps. 96 97 Auxiliary routines: 98 99 ---requeue 100 101 requeues once dequeued packet. It is used for non-standard or 102 just buggy devices, which can defer output even if dev->tbusy=0. 103 104 ---reset 105 106 returns qdisc to initial state: purge all buffers, clear all 107 timers, counters (except for statistics) etc. 108 109 ---init 110 111 initializes newly created qdisc. 112 113 ---destroy 114 115 destroys resources allocated by init and during lifetime of qdisc. 116 117 ---change 118 119 changes qdisc parameters. 120 */ 121 122 /* Protects list of registered TC modules. It is pure SMP lock. */ 123 static DEFINE_RWLOCK(qdisc_mod_lock); 124 125 126 /************************************************ 127 * Queueing disciplines manipulation. * 128 ************************************************/ 129 130 131 /* The list of all installed queueing disciplines. */ 132 133 static struct Qdisc_ops *qdisc_base; 134 135 /* Register/uregister queueing discipline */ 136 137 int register_qdisc(struct Qdisc_ops *qops) 138 { 139 struct Qdisc_ops *q, **qp; 140 int rc = -EEXIST; 141 142 write_lock(&qdisc_mod_lock); 143 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) 144 if (!strcmp(qops->id, q->id)) 145 goto out; 146 147 if (qops->enqueue == NULL) 148 qops->enqueue = noop_qdisc_ops.enqueue; 149 if (qops->requeue == NULL) 150 qops->requeue = noop_qdisc_ops.requeue; 151 if (qops->dequeue == NULL) 152 qops->dequeue = noop_qdisc_ops.dequeue; 153 154 qops->next = NULL; 155 *qp = qops; 156 rc = 0; 157 out: 158 write_unlock(&qdisc_mod_lock); 159 return rc; 160 } 161 EXPORT_SYMBOL(register_qdisc); 162 163 int unregister_qdisc(struct Qdisc_ops *qops) 164 { 165 struct Qdisc_ops *q, **qp; 166 int err = -ENOENT; 167 168 write_lock(&qdisc_mod_lock); 169 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) 170 if (q == qops) 171 break; 172 if (q) { 173 *qp = q->next; 174 q->next = NULL; 175 err = 0; 176 } 177 write_unlock(&qdisc_mod_lock); 178 return err; 179 } 180 EXPORT_SYMBOL(unregister_qdisc); 181 182 /* We know handle. Find qdisc among all qdisc's attached to device 183 (root qdisc, all its children, children of children etc.) 184 */ 185 186 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) 187 { 188 struct Qdisc *q; 189 190 list_for_each_entry(q, &dev->qdisc_list, list) { 191 if (q->handle == handle) 192 return q; 193 } 194 return NULL; 195 } 196 197 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) 198 { 199 unsigned long cl; 200 struct Qdisc *leaf; 201 const struct Qdisc_class_ops *cops = p->ops->cl_ops; 202 203 if (cops == NULL) 204 return NULL; 205 cl = cops->get(p, classid); 206 207 if (cl == 0) 208 return NULL; 209 leaf = cops->leaf(p, cl); 210 cops->put(p, cl); 211 return leaf; 212 } 213 214 /* Find queueing discipline by name */ 215 216 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) 217 { 218 struct Qdisc_ops *q = NULL; 219 220 if (kind) { 221 read_lock(&qdisc_mod_lock); 222 for (q = qdisc_base; q; q = q->next) { 223 if (rtattr_strcmp(kind, q->id) == 0) { 224 if (!try_module_get(q->owner)) 225 q = NULL; 226 break; 227 } 228 } 229 read_unlock(&qdisc_mod_lock); 230 } 231 return q; 232 } 233 234 static struct qdisc_rate_table *qdisc_rtab_list; 235 236 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) 237 { 238 struct qdisc_rate_table *rtab; 239 240 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { 241 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { 242 rtab->refcnt++; 243 return rtab; 244 } 245 } 246 247 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) 248 return NULL; 249 250 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); 251 if (rtab) { 252 rtab->rate = *r; 253 rtab->refcnt = 1; 254 memcpy(rtab->data, RTA_DATA(tab), 1024); 255 rtab->next = qdisc_rtab_list; 256 qdisc_rtab_list = rtab; 257 } 258 return rtab; 259 } 260 EXPORT_SYMBOL(qdisc_get_rtab); 261 262 void qdisc_put_rtab(struct qdisc_rate_table *tab) 263 { 264 struct qdisc_rate_table *rtab, **rtabp; 265 266 if (!tab || --tab->refcnt) 267 return; 268 269 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { 270 if (rtab == tab) { 271 *rtabp = rtab->next; 272 kfree(rtab); 273 return; 274 } 275 } 276 } 277 EXPORT_SYMBOL(qdisc_put_rtab); 278 279 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) 280 { 281 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, 282 timer); 283 struct net_device *dev = wd->qdisc->dev; 284 285 wd->qdisc->flags &= ~TCQ_F_THROTTLED; 286 smp_wmb(); 287 netif_schedule(dev); 288 289 return HRTIMER_NORESTART; 290 } 291 292 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) 293 { 294 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 295 wd->timer.function = qdisc_watchdog; 296 wd->qdisc = qdisc; 297 } 298 EXPORT_SYMBOL(qdisc_watchdog_init); 299 300 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) 301 { 302 ktime_t time; 303 304 wd->qdisc->flags |= TCQ_F_THROTTLED; 305 time = ktime_set(0, 0); 306 time = ktime_add_ns(time, PSCHED_US2NS(expires)); 307 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); 308 } 309 EXPORT_SYMBOL(qdisc_watchdog_schedule); 310 311 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) 312 { 313 hrtimer_cancel(&wd->timer); 314 wd->qdisc->flags &= ~TCQ_F_THROTTLED; 315 } 316 EXPORT_SYMBOL(qdisc_watchdog_cancel); 317 318 /* Allocate an unique handle from space managed by kernel */ 319 320 static u32 qdisc_alloc_handle(struct net_device *dev) 321 { 322 int i = 0x10000; 323 static u32 autohandle = TC_H_MAKE(0x80000000U, 0); 324 325 do { 326 autohandle += TC_H_MAKE(0x10000U, 0); 327 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) 328 autohandle = TC_H_MAKE(0x80000000U, 0); 329 } while (qdisc_lookup(dev, autohandle) && --i > 0); 330 331 return i>0 ? autohandle : 0; 332 } 333 334 /* Attach toplevel qdisc to device dev */ 335 336 static struct Qdisc * 337 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) 338 { 339 struct Qdisc *oqdisc; 340 341 if (dev->flags & IFF_UP) 342 dev_deactivate(dev); 343 344 qdisc_lock_tree(dev); 345 if (qdisc && qdisc->flags&TCQ_F_INGRESS) { 346 oqdisc = dev->qdisc_ingress; 347 /* Prune old scheduler */ 348 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { 349 /* delete */ 350 qdisc_reset(oqdisc); 351 dev->qdisc_ingress = NULL; 352 } else { /* new */ 353 dev->qdisc_ingress = qdisc; 354 } 355 356 } else { 357 358 oqdisc = dev->qdisc_sleeping; 359 360 /* Prune old scheduler */ 361 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) 362 qdisc_reset(oqdisc); 363 364 /* ... and graft new one */ 365 if (qdisc == NULL) 366 qdisc = &noop_qdisc; 367 dev->qdisc_sleeping = qdisc; 368 dev->qdisc = &noop_qdisc; 369 } 370 371 qdisc_unlock_tree(dev); 372 373 if (dev->flags & IFF_UP) 374 dev_activate(dev); 375 376 return oqdisc; 377 } 378 379 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) 380 { 381 const struct Qdisc_class_ops *cops; 382 unsigned long cl; 383 u32 parentid; 384 385 if (n == 0) 386 return; 387 while ((parentid = sch->parent)) { 388 sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid)); 389 if (sch == NULL) { 390 WARN_ON(parentid != TC_H_ROOT); 391 return; 392 } 393 cops = sch->ops->cl_ops; 394 if (cops->qlen_notify) { 395 cl = cops->get(sch, parentid); 396 cops->qlen_notify(sch, cl); 397 cops->put(sch, cl); 398 } 399 sch->q.qlen -= n; 400 } 401 } 402 EXPORT_SYMBOL(qdisc_tree_decrease_qlen); 403 404 /* Graft qdisc "new" to class "classid" of qdisc "parent" or 405 to device "dev". 406 407 Old qdisc is not destroyed but returned in *old. 408 */ 409 410 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, 411 u32 classid, 412 struct Qdisc *new, struct Qdisc **old) 413 { 414 int err = 0; 415 struct Qdisc *q = *old; 416 417 418 if (parent == NULL) { 419 if (q && q->flags&TCQ_F_INGRESS) { 420 *old = dev_graft_qdisc(dev, q); 421 } else { 422 *old = dev_graft_qdisc(dev, new); 423 } 424 } else { 425 const struct Qdisc_class_ops *cops = parent->ops->cl_ops; 426 427 err = -EINVAL; 428 429 if (cops) { 430 unsigned long cl = cops->get(parent, classid); 431 if (cl) { 432 err = cops->graft(parent, cl, new, old); 433 cops->put(parent, cl); 434 } 435 } 436 } 437 return err; 438 } 439 440 /* 441 Allocate and initialize new qdisc. 442 443 Parameters are passed via opt. 444 */ 445 446 static struct Qdisc * 447 qdisc_create(struct net_device *dev, u32 parent, u32 handle, 448 struct rtattr **tca, int *errp) 449 { 450 int err; 451 struct rtattr *kind = tca[TCA_KIND-1]; 452 struct Qdisc *sch; 453 struct Qdisc_ops *ops; 454 455 ops = qdisc_lookup_ops(kind); 456 #ifdef CONFIG_KMOD 457 if (ops == NULL && kind != NULL) { 458 char name[IFNAMSIZ]; 459 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { 460 /* We dropped the RTNL semaphore in order to 461 * perform the module load. So, even if we 462 * succeeded in loading the module we have to 463 * tell the caller to replay the request. We 464 * indicate this using -EAGAIN. 465 * We replay the request because the device may 466 * go away in the mean time. 467 */ 468 rtnl_unlock(); 469 request_module("sch_%s", name); 470 rtnl_lock(); 471 ops = qdisc_lookup_ops(kind); 472 if (ops != NULL) { 473 /* We will try again qdisc_lookup_ops, 474 * so don't keep a reference. 475 */ 476 module_put(ops->owner); 477 err = -EAGAIN; 478 goto err_out; 479 } 480 } 481 } 482 #endif 483 484 err = -ENOENT; 485 if (ops == NULL) 486 goto err_out; 487 488 sch = qdisc_alloc(dev, ops); 489 if (IS_ERR(sch)) { 490 err = PTR_ERR(sch); 491 goto err_out2; 492 } 493 494 sch->parent = parent; 495 496 if (handle == TC_H_INGRESS) { 497 sch->flags |= TCQ_F_INGRESS; 498 sch->stats_lock = &dev->ingress_lock; 499 handle = TC_H_MAKE(TC_H_INGRESS, 0); 500 } else { 501 sch->stats_lock = &dev->queue_lock; 502 if (handle == 0) { 503 handle = qdisc_alloc_handle(dev); 504 err = -ENOMEM; 505 if (handle == 0) 506 goto err_out3; 507 } 508 } 509 510 sch->handle = handle; 511 512 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 513 if (tca[TCA_RATE-1]) { 514 err = gen_new_estimator(&sch->bstats, &sch->rate_est, 515 sch->stats_lock, 516 tca[TCA_RATE-1]); 517 if (err) { 518 /* 519 * Any broken qdiscs that would require 520 * a ops->reset() here? The qdisc was never 521 * in action so it shouldn't be necessary. 522 */ 523 if (ops->destroy) 524 ops->destroy(sch); 525 goto err_out3; 526 } 527 } 528 qdisc_lock_tree(dev); 529 list_add_tail(&sch->list, &dev->qdisc_list); 530 qdisc_unlock_tree(dev); 531 532 return sch; 533 } 534 err_out3: 535 dev_put(dev); 536 kfree((char *) sch - sch->padded); 537 err_out2: 538 module_put(ops->owner); 539 err_out: 540 *errp = err; 541 return NULL; 542 } 543 544 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) 545 { 546 if (tca[TCA_OPTIONS-1]) { 547 int err; 548 549 if (sch->ops->change == NULL) 550 return -EINVAL; 551 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); 552 if (err) 553 return err; 554 } 555 if (tca[TCA_RATE-1]) 556 gen_replace_estimator(&sch->bstats, &sch->rate_est, 557 sch->stats_lock, tca[TCA_RATE-1]); 558 return 0; 559 } 560 561 struct check_loop_arg 562 { 563 struct qdisc_walker w; 564 struct Qdisc *p; 565 int depth; 566 }; 567 568 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); 569 570 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) 571 { 572 struct check_loop_arg arg; 573 574 if (q->ops->cl_ops == NULL) 575 return 0; 576 577 arg.w.stop = arg.w.skip = arg.w.count = 0; 578 arg.w.fn = check_loop_fn; 579 arg.depth = depth; 580 arg.p = p; 581 q->ops->cl_ops->walk(q, &arg.w); 582 return arg.w.stop ? -ELOOP : 0; 583 } 584 585 static int 586 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) 587 { 588 struct Qdisc *leaf; 589 const struct Qdisc_class_ops *cops = q->ops->cl_ops; 590 struct check_loop_arg *arg = (struct check_loop_arg *)w; 591 592 leaf = cops->leaf(q, cl); 593 if (leaf) { 594 if (leaf == arg->p || arg->depth > 7) 595 return -ELOOP; 596 return check_loop(leaf, arg->p, arg->depth + 1); 597 } 598 return 0; 599 } 600 601 /* 602 * Delete/get qdisc. 603 */ 604 605 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 606 { 607 struct net *net = skb->sk->sk_net; 608 struct tcmsg *tcm = NLMSG_DATA(n); 609 struct rtattr **tca = arg; 610 struct net_device *dev; 611 u32 clid = tcm->tcm_parent; 612 struct Qdisc *q = NULL; 613 struct Qdisc *p = NULL; 614 int err; 615 616 if (net != &init_net) 617 return -EINVAL; 618 619 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) 620 return -ENODEV; 621 622 if (clid) { 623 if (clid != TC_H_ROOT) { 624 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { 625 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 626 return -ENOENT; 627 q = qdisc_leaf(p, clid); 628 } else { /* ingress */ 629 q = dev->qdisc_ingress; 630 } 631 } else { 632 q = dev->qdisc_sleeping; 633 } 634 if (!q) 635 return -ENOENT; 636 637 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) 638 return -EINVAL; 639 } else { 640 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) 641 return -ENOENT; 642 } 643 644 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) 645 return -EINVAL; 646 647 if (n->nlmsg_type == RTM_DELQDISC) { 648 if (!clid) 649 return -EINVAL; 650 if (q->handle == 0) 651 return -ENOENT; 652 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) 653 return err; 654 if (q) { 655 qdisc_notify(skb, n, clid, q, NULL); 656 qdisc_lock_tree(dev); 657 qdisc_destroy(q); 658 qdisc_unlock_tree(dev); 659 } 660 } else { 661 qdisc_notify(skb, n, clid, NULL, q); 662 } 663 return 0; 664 } 665 666 /* 667 Create/change qdisc. 668 */ 669 670 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 671 { 672 struct net *net = skb->sk->sk_net; 673 struct tcmsg *tcm; 674 struct rtattr **tca; 675 struct net_device *dev; 676 u32 clid; 677 struct Qdisc *q, *p; 678 int err; 679 680 if (net != &init_net) 681 return -EINVAL; 682 683 replay: 684 /* Reinit, just in case something touches this. */ 685 tcm = NLMSG_DATA(n); 686 tca = arg; 687 clid = tcm->tcm_parent; 688 q = p = NULL; 689 690 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) 691 return -ENODEV; 692 693 if (clid) { 694 if (clid != TC_H_ROOT) { 695 if (clid != TC_H_INGRESS) { 696 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 697 return -ENOENT; 698 q = qdisc_leaf(p, clid); 699 } else { /*ingress */ 700 q = dev->qdisc_ingress; 701 } 702 } else { 703 q = dev->qdisc_sleeping; 704 } 705 706 /* It may be default qdisc, ignore it */ 707 if (q && q->handle == 0) 708 q = NULL; 709 710 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { 711 if (tcm->tcm_handle) { 712 if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) 713 return -EEXIST; 714 if (TC_H_MIN(tcm->tcm_handle)) 715 return -EINVAL; 716 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) 717 goto create_n_graft; 718 if (n->nlmsg_flags&NLM_F_EXCL) 719 return -EEXIST; 720 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) 721 return -EINVAL; 722 if (q == p || 723 (p && check_loop(q, p, 0))) 724 return -ELOOP; 725 atomic_inc(&q->refcnt); 726 goto graft; 727 } else { 728 if (q == NULL) 729 goto create_n_graft; 730 731 /* This magic test requires explanation. 732 * 733 * We know, that some child q is already 734 * attached to this parent and have choice: 735 * either to change it or to create/graft new one. 736 * 737 * 1. We are allowed to create/graft only 738 * if CREATE and REPLACE flags are set. 739 * 740 * 2. If EXCL is set, requestor wanted to say, 741 * that qdisc tcm_handle is not expected 742 * to exist, so that we choose create/graft too. 743 * 744 * 3. The last case is when no flags are set. 745 * Alas, it is sort of hole in API, we 746 * cannot decide what to do unambiguously. 747 * For now we select create/graft, if 748 * user gave KIND, which does not match existing. 749 */ 750 if ((n->nlmsg_flags&NLM_F_CREATE) && 751 (n->nlmsg_flags&NLM_F_REPLACE) && 752 ((n->nlmsg_flags&NLM_F_EXCL) || 753 (tca[TCA_KIND-1] && 754 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) 755 goto create_n_graft; 756 } 757 } 758 } else { 759 if (!tcm->tcm_handle) 760 return -EINVAL; 761 q = qdisc_lookup(dev, tcm->tcm_handle); 762 } 763 764 /* Change qdisc parameters */ 765 if (q == NULL) 766 return -ENOENT; 767 if (n->nlmsg_flags&NLM_F_EXCL) 768 return -EEXIST; 769 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) 770 return -EINVAL; 771 err = qdisc_change(q, tca); 772 if (err == 0) 773 qdisc_notify(skb, n, clid, NULL, q); 774 return err; 775 776 create_n_graft: 777 if (!(n->nlmsg_flags&NLM_F_CREATE)) 778 return -ENOENT; 779 if (clid == TC_H_INGRESS) 780 q = qdisc_create(dev, tcm->tcm_parent, tcm->tcm_parent, 781 tca, &err); 782 else 783 q = qdisc_create(dev, tcm->tcm_parent, tcm->tcm_handle, 784 tca, &err); 785 if (q == NULL) { 786 if (err == -EAGAIN) 787 goto replay; 788 return err; 789 } 790 791 graft: 792 if (1) { 793 struct Qdisc *old_q = NULL; 794 err = qdisc_graft(dev, p, clid, q, &old_q); 795 if (err) { 796 if (q) { 797 qdisc_lock_tree(dev); 798 qdisc_destroy(q); 799 qdisc_unlock_tree(dev); 800 } 801 return err; 802 } 803 qdisc_notify(skb, n, clid, old_q, q); 804 if (old_q) { 805 qdisc_lock_tree(dev); 806 qdisc_destroy(old_q); 807 qdisc_unlock_tree(dev); 808 } 809 } 810 return 0; 811 } 812 813 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 814 u32 pid, u32 seq, u16 flags, int event) 815 { 816 struct tcmsg *tcm; 817 struct nlmsghdr *nlh; 818 unsigned char *b = skb_tail_pointer(skb); 819 struct gnet_dump d; 820 821 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 822 tcm = NLMSG_DATA(nlh); 823 tcm->tcm_family = AF_UNSPEC; 824 tcm->tcm__pad1 = 0; 825 tcm->tcm__pad2 = 0; 826 tcm->tcm_ifindex = q->dev->ifindex; 827 tcm->tcm_parent = clid; 828 tcm->tcm_handle = q->handle; 829 tcm->tcm_info = atomic_read(&q->refcnt); 830 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); 831 if (q->ops->dump && q->ops->dump(q, skb) < 0) 832 goto rtattr_failure; 833 q->qstats.qlen = q->q.qlen; 834 835 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, 836 TCA_XSTATS, q->stats_lock, &d) < 0) 837 goto rtattr_failure; 838 839 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) 840 goto rtattr_failure; 841 842 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || 843 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || 844 gnet_stats_copy_queue(&d, &q->qstats) < 0) 845 goto rtattr_failure; 846 847 if (gnet_stats_finish_copy(&d) < 0) 848 goto rtattr_failure; 849 850 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 851 return skb->len; 852 853 nlmsg_failure: 854 rtattr_failure: 855 nlmsg_trim(skb, b); 856 return -1; 857 } 858 859 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, 860 u32 clid, struct Qdisc *old, struct Qdisc *new) 861 { 862 struct sk_buff *skb; 863 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; 864 865 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 866 if (!skb) 867 return -ENOBUFS; 868 869 if (old && old->handle) { 870 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) 871 goto err_out; 872 } 873 if (new) { 874 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) 875 goto err_out; 876 } 877 878 if (skb->len) 879 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 880 881 err_out: 882 kfree_skb(skb); 883 return -EINVAL; 884 } 885 886 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) 887 { 888 struct net *net = skb->sk->sk_net; 889 int idx, q_idx; 890 int s_idx, s_q_idx; 891 struct net_device *dev; 892 struct Qdisc *q; 893 894 if (net != &init_net) 895 return 0; 896 897 s_idx = cb->args[0]; 898 s_q_idx = q_idx = cb->args[1]; 899 read_lock(&dev_base_lock); 900 idx = 0; 901 for_each_netdev(&init_net, dev) { 902 if (idx < s_idx) 903 goto cont; 904 if (idx > s_idx) 905 s_q_idx = 0; 906 q_idx = 0; 907 list_for_each_entry(q, &dev->qdisc_list, list) { 908 if (q_idx < s_q_idx) { 909 q_idx++; 910 continue; 911 } 912 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, 913 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) 914 goto done; 915 q_idx++; 916 } 917 cont: 918 idx++; 919 } 920 921 done: 922 read_unlock(&dev_base_lock); 923 924 cb->args[0] = idx; 925 cb->args[1] = q_idx; 926 927 return skb->len; 928 } 929 930 931 932 /************************************************ 933 * Traffic classes manipulation. * 934 ************************************************/ 935 936 937 938 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 939 { 940 struct net *net = skb->sk->sk_net; 941 struct tcmsg *tcm = NLMSG_DATA(n); 942 struct rtattr **tca = arg; 943 struct net_device *dev; 944 struct Qdisc *q = NULL; 945 const struct Qdisc_class_ops *cops; 946 unsigned long cl = 0; 947 unsigned long new_cl; 948 u32 pid = tcm->tcm_parent; 949 u32 clid = tcm->tcm_handle; 950 u32 qid = TC_H_MAJ(clid); 951 int err; 952 953 if (net != &init_net) 954 return -EINVAL; 955 956 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) 957 return -ENODEV; 958 959 /* 960 parent == TC_H_UNSPEC - unspecified parent. 961 parent == TC_H_ROOT - class is root, which has no parent. 962 parent == X:0 - parent is root class. 963 parent == X:Y - parent is a node in hierarchy. 964 parent == 0:Y - parent is X:Y, where X:0 is qdisc. 965 966 handle == 0:0 - generate handle from kernel pool. 967 handle == 0:Y - class is X:Y, where X:0 is qdisc. 968 handle == X:Y - clear. 969 handle == X:0 - root class. 970 */ 971 972 /* Step 1. Determine qdisc handle X:0 */ 973 974 if (pid != TC_H_ROOT) { 975 u32 qid1 = TC_H_MAJ(pid); 976 977 if (qid && qid1) { 978 /* If both majors are known, they must be identical. */ 979 if (qid != qid1) 980 return -EINVAL; 981 } else if (qid1) { 982 qid = qid1; 983 } else if (qid == 0) 984 qid = dev->qdisc_sleeping->handle; 985 986 /* Now qid is genuine qdisc handle consistent 987 both with parent and child. 988 989 TC_H_MAJ(pid) still may be unspecified, complete it now. 990 */ 991 if (pid) 992 pid = TC_H_MAKE(qid, pid); 993 } else { 994 if (qid == 0) 995 qid = dev->qdisc_sleeping->handle; 996 } 997 998 /* OK. Locate qdisc */ 999 if ((q = qdisc_lookup(dev, qid)) == NULL) 1000 return -ENOENT; 1001 1002 /* An check that it supports classes */ 1003 cops = q->ops->cl_ops; 1004 if (cops == NULL) 1005 return -EINVAL; 1006 1007 /* Now try to get class */ 1008 if (clid == 0) { 1009 if (pid == TC_H_ROOT) 1010 clid = qid; 1011 } else 1012 clid = TC_H_MAKE(qid, clid); 1013 1014 if (clid) 1015 cl = cops->get(q, clid); 1016 1017 if (cl == 0) { 1018 err = -ENOENT; 1019 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) 1020 goto out; 1021 } else { 1022 switch (n->nlmsg_type) { 1023 case RTM_NEWTCLASS: 1024 err = -EEXIST; 1025 if (n->nlmsg_flags&NLM_F_EXCL) 1026 goto out; 1027 break; 1028 case RTM_DELTCLASS: 1029 err = cops->delete(q, cl); 1030 if (err == 0) 1031 tclass_notify(skb, n, q, cl, RTM_DELTCLASS); 1032 goto out; 1033 case RTM_GETTCLASS: 1034 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); 1035 goto out; 1036 default: 1037 err = -EINVAL; 1038 goto out; 1039 } 1040 } 1041 1042 new_cl = cl; 1043 err = cops->change(q, clid, pid, tca, &new_cl); 1044 if (err == 0) 1045 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); 1046 1047 out: 1048 if (cl) 1049 cops->put(q, cl); 1050 1051 return err; 1052 } 1053 1054 1055 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 1056 unsigned long cl, 1057 u32 pid, u32 seq, u16 flags, int event) 1058 { 1059 struct tcmsg *tcm; 1060 struct nlmsghdr *nlh; 1061 unsigned char *b = skb_tail_pointer(skb); 1062 struct gnet_dump d; 1063 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; 1064 1065 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 1066 tcm = NLMSG_DATA(nlh); 1067 tcm->tcm_family = AF_UNSPEC; 1068 tcm->tcm_ifindex = q->dev->ifindex; 1069 tcm->tcm_parent = q->handle; 1070 tcm->tcm_handle = q->handle; 1071 tcm->tcm_info = 0; 1072 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); 1073 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) 1074 goto rtattr_failure; 1075 1076 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, 1077 TCA_XSTATS, q->stats_lock, &d) < 0) 1078 goto rtattr_failure; 1079 1080 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) 1081 goto rtattr_failure; 1082 1083 if (gnet_stats_finish_copy(&d) < 0) 1084 goto rtattr_failure; 1085 1086 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 1087 return skb->len; 1088 1089 nlmsg_failure: 1090 rtattr_failure: 1091 nlmsg_trim(skb, b); 1092 return -1; 1093 } 1094 1095 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, 1096 struct Qdisc *q, unsigned long cl, int event) 1097 { 1098 struct sk_buff *skb; 1099 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; 1100 1101 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1102 if (!skb) 1103 return -ENOBUFS; 1104 1105 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { 1106 kfree_skb(skb); 1107 return -EINVAL; 1108 } 1109 1110 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 1111 } 1112 1113 struct qdisc_dump_args 1114 { 1115 struct qdisc_walker w; 1116 struct sk_buff *skb; 1117 struct netlink_callback *cb; 1118 }; 1119 1120 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) 1121 { 1122 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; 1123 1124 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, 1125 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); 1126 } 1127 1128 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) 1129 { 1130 struct net *net = skb->sk->sk_net; 1131 int t; 1132 int s_t; 1133 struct net_device *dev; 1134 struct Qdisc *q; 1135 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); 1136 struct qdisc_dump_args arg; 1137 1138 if (net != &init_net) 1139 return 0; 1140 1141 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) 1142 return 0; 1143 if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL) 1144 return 0; 1145 1146 s_t = cb->args[0]; 1147 t = 0; 1148 1149 list_for_each_entry(q, &dev->qdisc_list, list) { 1150 if (t < s_t || !q->ops->cl_ops || 1151 (tcm->tcm_parent && 1152 TC_H_MAJ(tcm->tcm_parent) != q->handle)) { 1153 t++; 1154 continue; 1155 } 1156 if (t > s_t) 1157 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); 1158 arg.w.fn = qdisc_class_dump; 1159 arg.skb = skb; 1160 arg.cb = cb; 1161 arg.w.stop = 0; 1162 arg.w.skip = cb->args[1]; 1163 arg.w.count = 0; 1164 q->ops->cl_ops->walk(q, &arg.w); 1165 cb->args[1] = arg.w.count; 1166 if (arg.w.stop) 1167 break; 1168 t++; 1169 } 1170 1171 cb->args[0] = t; 1172 1173 dev_put(dev); 1174 return skb->len; 1175 } 1176 1177 /* Main classifier routine: scans classifier chain attached 1178 to this qdisc, (optionally) tests for protocol and asks 1179 specific classifiers. 1180 */ 1181 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp, 1182 struct tcf_result *res) 1183 { 1184 __be16 protocol = skb->protocol; 1185 int err = 0; 1186 1187 for (; tp; tp = tp->next) { 1188 if ((tp->protocol == protocol || 1189 tp->protocol == htons(ETH_P_ALL)) && 1190 (err = tp->classify(skb, tp, res)) >= 0) { 1191 #ifdef CONFIG_NET_CLS_ACT 1192 if (err != TC_ACT_RECLASSIFY && skb->tc_verd) 1193 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); 1194 #endif 1195 return err; 1196 } 1197 } 1198 return -1; 1199 } 1200 EXPORT_SYMBOL(tc_classify_compat); 1201 1202 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, 1203 struct tcf_result *res) 1204 { 1205 int err = 0; 1206 __be16 protocol; 1207 #ifdef CONFIG_NET_CLS_ACT 1208 struct tcf_proto *otp = tp; 1209 reclassify: 1210 #endif 1211 protocol = skb->protocol; 1212 1213 err = tc_classify_compat(skb, tp, res); 1214 #ifdef CONFIG_NET_CLS_ACT 1215 if (err == TC_ACT_RECLASSIFY) { 1216 u32 verd = G_TC_VERD(skb->tc_verd); 1217 tp = otp; 1218 1219 if (verd++ >= MAX_REC_LOOP) { 1220 printk("rule prio %u protocol %02x reclassify loop, " 1221 "packet dropped\n", 1222 tp->prio&0xffff, ntohs(tp->protocol)); 1223 return TC_ACT_SHOT; 1224 } 1225 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); 1226 goto reclassify; 1227 } 1228 #endif 1229 return err; 1230 } 1231 EXPORT_SYMBOL(tc_classify); 1232 1233 void tcf_destroy(struct tcf_proto *tp) 1234 { 1235 tp->ops->destroy(tp); 1236 module_put(tp->ops->owner); 1237 kfree(tp); 1238 } 1239 1240 void tcf_destroy_chain(struct tcf_proto *fl) 1241 { 1242 struct tcf_proto *tp; 1243 1244 while ((tp = fl) != NULL) { 1245 fl = tp->next; 1246 tcf_destroy(tp); 1247 } 1248 } 1249 EXPORT_SYMBOL(tcf_destroy_chain); 1250 1251 #ifdef CONFIG_PROC_FS 1252 static int psched_show(struct seq_file *seq, void *v) 1253 { 1254 struct timespec ts; 1255 1256 hrtimer_get_res(CLOCK_MONOTONIC, &ts); 1257 seq_printf(seq, "%08x %08x %08x %08x\n", 1258 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1), 1259 1000000, 1260 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); 1261 1262 return 0; 1263 } 1264 1265 static int psched_open(struct inode *inode, struct file *file) 1266 { 1267 return single_open(file, psched_show, PDE(inode)->data); 1268 } 1269 1270 static const struct file_operations psched_fops = { 1271 .owner = THIS_MODULE, 1272 .open = psched_open, 1273 .read = seq_read, 1274 .llseek = seq_lseek, 1275 .release = single_release, 1276 }; 1277 #endif 1278 1279 static int __init pktsched_init(void) 1280 { 1281 register_qdisc(&pfifo_qdisc_ops); 1282 register_qdisc(&bfifo_qdisc_ops); 1283 proc_net_fops_create(&init_net, "psched", 0, &psched_fops); 1284 1285 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL); 1286 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL); 1287 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc); 1288 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL); 1289 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL); 1290 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass); 1291 1292 return 0; 1293 } 1294 1295 subsys_initcall(pktsched_init); 1296