/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box which is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in an order and at times determined by the
   algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work common to all qdiscs, and to
   provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty, only that the discipline
   does not want to send anything at this time. The queue is really
   empty if q->q.qlen == 0. For complicated disciplines with multiple
   queues, q->q is not the real packet queue, but q->q.qlen must still
   be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was dequeued once. It is used for
   non-standard or just buggy devices, which can defer output even
   if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers and
   clears all timers and counters (except for statistics).

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
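
/*
 * To make the conventions above concrete, here is an illustrative
 * sketch (not part of this file) of a trivial pfifo-style pair of
 * routines; EXAMPLE_LIMIT is a hypothetical queue limit:
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (skb_queue_len(&sch->q) < EXAMPLE_LIMIT) {
 *			__skb_queue_tail(&sch->q, skb);
 *			sch->qstats.backlog += skb->len;
 *			return NET_XMIT_SUCCESS;
 *		}
 *		sch->qstats.drops++;
 *		kfree_skb(skb);
 *		return NET_XMIT_DROP;
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		struct sk_buff *skb = __skb_dequeue(&sch->q);
 *		if (skb)
 *			sch->qstats.backlog -= skb->len;
 *		return skb;	NULL here need not mean the queue is empty
 *	}
 */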

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
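
/*
 * Typical usage from a scheduler module (sketch; example_qdisc_ops is
 * hypothetical):
 *
 *	static int __init example_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 * register_qdisc() fails with -EEXIST if an ops with the same id
 * string is already registered; unregister_qdisc() fails with -ENOENT
 * if the ops was never registered.
 */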

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (q->handle == handle) {
			read_unlock_bh(&qdisc_tree_lock);
			return q;
		}
	}
	read_unlock_bh(&qdisc_tree_lock);
	return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (rtattr_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, RTA_DATA(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
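
/*
 * Sketch of how a shaping qdisc uses this cache (cf. sch_tbf.c, which
 * passes its TCA_TBF_RTAB attribute; the surrounding names are
 * illustrative):
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);
 *
 * qdisc_get_rtab() shares an existing table when the tc_ratespec
 * matches, and otherwise copies the 1024-byte table supplied by the
 * user; qdisc_put_rtab() drops a reference and frees the table once
 * the last user is gone.
 */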


/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}
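
/*
 * A handle is a 32-bit value: the upper 16 bits are the major number
 * ("X:" in tc notation), the lower 16 bits the minor. For example:
 *
 *	TC_H_MAJ(0x80010005) == 0x80010000	(major 8001:)
 *	TC_H_MIN(0x80010005) == 0x00000005	(minor :5)
 *	TC_H_MAKE(0x80010000, 5) == 0x80010005	(8001:5)
 *
 * qdisc_alloc_handle() therefore walks the majors 8001:, 8002:, ...,
 * skipping TC_H_ROOT, and gives up after 0x10000 attempts.
 */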

/* Attach the toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	qdisc_lock_tree(dev);
	if (qdisc && qdisc->flags & TCQ_F_INGRESS) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	qdisc_unlock_tree(dev);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       u32 classid,
		       struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		if (q && q->flags & TCQ_F_INGRESS) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				if (new)
					new->parent = classid;
				cops->put(parent, cl);
			}
		}
	}
	return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
	int err;
	struct rtattr *kind = tca[TCA_KIND-1];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replay will call qdisc_lookup_ops()
				 * again, so don't keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -EINVAL;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else if (handle == 0) {
		handle = qdisc_alloc_handle(dev);
		err = -ENOMEM;
		if (handle == 0)
			goto err_out3;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						sch->stats_lock,
						tca[TCA_RATE-1]);
			if (err) {
				/*
				 * Are there any broken qdiscs that would
				 * require an ops->reset() here? The qdisc
				 * was never in action, so it shouldn't be
				 * necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
#endif
		qdisc_lock_tree(dev);
		list_add_tail(&sch->list, &dev->qdisc_list);
		qdisc_unlock_tree(dev);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
	if (tca[TCA_OPTIONS-1]) {
		int err;

		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
		if (err)
			return err;
	}
#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
			sch->stats_lock, tca[TCA_RATE-1]);
#endif
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm;
	struct rtattr **tca;
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	tca = arg;
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags
				 *   are set.
				 *
				 *   2. If EXCL is set, the requestor meant
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are
				 *   set. Alas, this is a sort of hole in
				 *   the API; we cannot decide what to do
				 *   unambiguously. For now we select
				 *   create/graft if the user gave a KIND
				 *   which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
	else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
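
/*
 * For orientation, iproute2's tc maps its commands onto the netlink
 * flags tested above roughly as follows (a property of tc, noted here
 * for illustration, not of this file):
 *
 *	tc qdisc add	 -> NLM_F_CREATE | NLM_F_EXCL
 *	tc qdisc replace -> NLM_F_CREATE | NLM_F_REPLACE
 *	tc qdisc change	 -> (neither flag)
 *
 * so "add" fails with -EEXIST if a qdisc is already grafted, "change"
 * fails with -ENOENT if none exists, and "replace" succeeds either way.
 */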

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	q->qstats.qlen = q->q.qlen;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;
		read_lock_bh(&qdisc_tree_lock);
		q_idx = 0;
		list_for_each_entry(q, &dev->qdisc_list, list) {
			if (q_idx < s_q_idx) {
				q_idx++;
				continue;
			}
			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock_bh(&qdisc_tree_lock);
				goto done;
			}
			q_idx++;
		}
		read_unlock_bh(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - class is X:Y, as given.
	   handle == X:0	 - root class.
	 */
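
	/* For example, "tc class add dev eth0 parent 1: classid 1:10"
	 * (iproute2 syntax, shown for illustration) arrives here with
	 * tcm_parent == 1:0 and tcm_handle == 1:10, so qid resolves to
	 * the 1: major below.
	 */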

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc. */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes. */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class. */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;
	struct gnet_dump d;
	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
			TCA_XSTATS, q->stats_lock, &d) < 0)
		goto rtattr_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto rtattr_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto rtattr_failure;

	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	read_lock_bh(&qdisc_tree_lock);
	list_for_each_entry(q, &dev->qdisc_list, list) {
		if (t < s_t || !q->ops->cl_ops ||
		    (tcm->tcm_parent &&
		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
			t++;
			continue;
		}
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
		t++;
	}
	read_unlock_bh(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol, and asks the
   specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
	struct tcf_result *res)
{
	int err = 0;
	u32 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	for ( ; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
			tp->protocol == __constant_htons(ETH_P_ALL)) &&
			(err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (TC_ACT_RECLASSIFY == err) {
				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
				tp = otp;

				if (MAX_REC_LOOP < verd++) {
					printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
						tp->prio&0xffff, ntohs(tp->protocol));
					return TC_ACT_SHOT;
				}
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
				goto reclassify;
			} else {
				if (skb->tc_verd)
					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
				return err;
			}
#else

			return err;
#endif
		}

	}
	return -1;
}
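
/*
 * Sketch of a typical caller (cf. prio_classify() in sch_prio.c; the
 * surrounding names are illustrative): a classful qdisc resolves a
 * packet against its filter chain like this:
 *
 *	struct tcf_result res;
 *
 *	if (tc_classify(skb, q->filter_list, &res) >= 0)
 *		... map res.classid to a child class/band ...
 *	else
 *		... no filter matched; fall back to a default ...
 */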

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		      psched_tick_per_us, psched_us_per_tick,
		      1000000, HZ);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to a 4GHz CPU.
 */
static void psched_tick(unsigned long);
static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
	if (sizeof(cycles_t) == sizeof(u32)) {
		psched_time_t dummy_stamp;
		PSCHED_GET_TIME(dummy_stamp);
		psched_timer.expires = jiffies + 1*HZ;
		add_timer(&psched_timer);
	}
}

int __init psched_calibrate_clock(void)
{
	psched_time_t stamp, stamp1;
	struct timeval tv, tv1;
	psched_tdiff_t delay;
	long rdelay;
	unsigned long stop;

	psched_tick(0);
	stop = jiffies + HZ/10;
	PSCHED_GET_TIME(stamp);
	do_gettimeofday(&tv);
	while (time_before(jiffies, stop)) {
		barrier();
		cpu_relax();
	}
	PSCHED_GET_TIME(stamp1);
	do_gettimeofday(&tv1);

	delay = PSCHED_TDIFF(stamp1, stamp);
	rdelay = tv1.tv_usec - tv.tv_usec;
	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
	if (rdelay > delay)
		return -1;
	delay /= rdelay;
	psched_tick_per_us = delay;
	while ((delay >>= 1) != 0)
		psched_clock_scale++;
	psched_us_per_tick = 1<<psched_clock_scale;
	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
	return 0;
}
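
/*
 * Worked example of the arithmetic above (illustrative numbers): with
 * HZ == 1000, suppose the ~100ms busy-wait spans delay = 200000000
 * cycles and rdelay = 100000 us. Then psched_tick_per_us = 2000,
 * psched_clock_scale = 10 (floor(log2(2000))), psched_us_per_tick =
 * 1 << 10 = 1024, and psched_clock_per_hz = (2000*1000) >> 10 = 1953.
 */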
#endif

static int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Set up the rtnetlink links. It is done here to avoid
	   exporting a large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create("psched", 0, &psched_fops);

	return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);