/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty, it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a once dequeued packet. It is used by non-standard or
   just buggy devices, which can defer output even if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
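/*
 * Illustrative sketch (not part of this file): the conventions described
 * above map onto struct Qdisc_ops roughly as follows; the example_* names
 * are hypothetical and only show which callback implements which routine.
 *
 *	static struct Qdisc_ops example_qdisc_ops = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,	(returns 0 or NET_XMIT_*)
 *		.dequeue	= example_dequeue,	(skb to send, or NULL)
 *		.requeue	= example_requeue,	(put back a dequeued skb)
 *		.init		= example_init,
 *		.reset		= example_reset,
 *		.destroy	= example_destroy,
 *		.change		= example_change,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * A real qdisc module would pass such a structure to register_qdisc()
 * below from its module init function and call unregister_qdisc() on exit.
 */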
/* Protects list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle) {
			read_unlock_bh(&qdisc_tree_lock);
			return q;
		}
	}
	read_unlock_bh(&qdisc_tree_lock);
	return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}


/* Allocate a unique handle from space managed by kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
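/* Note on handle layout (background for the functions above and below): a
 * qdisc or class handle is a 32-bit value split into a 16-bit major and a
 * 16-bit minor number, composed with TC_H_MAKE(maj, min). A qdisc handle
 * always has minor 0; for example, the handle written as "1:" in tc is
 * 0x00010000, and kernel-allocated handles from qdisc_alloc_handle() start
 * at 0x80000000. The concrete "1:" value is only an illustration.
 */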
/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   Old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags&TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}
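/* In other words: qdisc_graft() has two cases. With no parent qdisc, the new
 * qdisc becomes the device's root (or ingress) qdisc via dev_graft_qdisc();
 * with a parent, the parent's class ops do the grafting and hand back the
 * displaced child in *old. For example, "tc qdisc add dev eth0 root ..."
 * ends up in the first path, while "tc qdisc add dev eth0 parent 1:10 ..."
 * takes the second (the device name and handles are only an illustration).
 */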
/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * an ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}
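/* Note on the CONFIG_KMOD path in qdisc_create() above: the module name is
 * derived from the TCA_KIND string as "sch_<kind>", so a request for kind
 * "htb" triggers request_module("sch_htb"). The example kind is purely
 * illustrative; any registered Qdisc_ops id works the same way.
 */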
struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
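/* Illustrative mapping (device name and handles are examples only): with the
 * stock iproute2 tc, "tc qdisc del dev eth0 root" arrives here as RTM_DELQDISC
 * with tcm_parent == TC_H_ROOT and no handle, so the root qdisc is found via
 * dev->qdisc_sleeping, detached with qdisc_graft(..., NULL, ...) and then
 * destroyed under the queue lock; RTM_GETQDISC requests only trigger a
 * qdisc_notify() (or go through the dump path below).
 */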
/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and we have a choice:
				 * either to change it or to create/graft a new one.
				 *
				 * 1. We are allowed to create/graft only
				 *    if both CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor wanted to say
				 *    that the qdisc tcm_handle is not expected
				 *    to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 *    Alas, it is sort of a hole in the API; we
				 *    cannot decide what to do unambiguously.
				 *    For now we select create/graft if the
				 *    user gave a KIND which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
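/* For reference (illustrative, describing the stock iproute2 tc utility
 * rather than anything enforced here): "tc qdisc add" is normally sent with
 * NLM_F_CREATE|NLM_F_EXCL, "tc qdisc replace" with NLM_F_CREATE|NLM_F_REPLACE,
 * and "tc qdisc change" with neither, which is how the three commands end up
 * in the different branches of the "magic test" above.
 */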
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
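/* gnet_stats_start_copy_compat() above is what lets old and new userspace
 * coexist: statistics gathered through the gnet_dump are emitted both in the
 * nested TCA_STATS2 attribute and in the legacy TCA_STATS/TCA_XSTATS format,
 * so one fill routine serves both kinds of listeners.
 */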
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock_bh(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock_bh(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock_bh(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}
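	/* Worked example (the numbers are illustrative only): for
	 * "tc class add dev eth0 parent 1: classid 1:10 ..." userspace sends
	 * tcm_parent = 0x00010000 ("1:") and tcm_handle = 0x00010010 ("1:10").
	 * Then pid = 0x00010000, clid = 0x00010010 and qid = TC_H_MAJ(clid)
	 * = 0x00010000, so both majors agree and qid already names the
	 * owning qdisc "1:".
	 */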
	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock_bh(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
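/* Both dump routines above follow the usual netlink dump convention: when the
 * skb fills up, the current position is parked in cb->args[] (the device or
 * qdisc index in args[0], the per-qdisc class count in args[1]) and the core
 * calls the dump function again with the same callback, so it can skip what
 * was already sent and carry on from there.
 */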
/* Main classifier routine: scans classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	u32 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
					       tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else

			return err;
#endif
		}

	}
	return -1;
}

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,
		   1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
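/* The four hex fields written by psched_show() are, in order: scheduler clock
 * ticks per microsecond, microseconds per tick, 1000000 and HZ. Userspace
 * tools (iproute2's tc, for instance) are expected to read /proc/net/psched
 * once at startup and use the first two values to convert between their own
 * time units and the kernel's packet scheduler ticks.
 */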
#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Setup rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);