xref: /openbmc/linux/net/sched/sch_api.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 
31 #include <net/net_namespace.h>
32 #include <net/netlink.h>
33 #include <net/pkt_sched.h>
34 
35 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
36 			struct Qdisc *old, struct Qdisc *new);
37 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
38 			 struct Qdisc *q, unsigned long cl, int event);
39 
40 /*
41 
42    Short review.
43    -------------
44 
45    This file consists of two interrelated parts:
46 
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49 
50    Generally, queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    device is ready to send something) in order and at times
53    determined by algorithm hidden in it.
54 
55    qdisc's are divided to two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets to "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59 
60    In turn, classes may have child qdiscs (as rule, queues)
61    attached to them etc. etc. etc.
62 
63    The goal of the routines in this file is to translate
64    information supplied by user in the form of handles
65    to more intelligible for kernel form, to make some sanity
66    checks and part of work, which is common to all qdiscs
67    and to provide rtnetlink notifications.
68 
69    All real intelligent work is done inside qdisc modules.
70 
71 
72 
73    Every discipline has two major routines: enqueue and dequeue.
74 
75    ---dequeue
76 
77    dequeue usually returns a skb to send. It is allowed to return NULL,
78    but it does not mean that queue is empty, it just means that
79    discipline does not want to send anything this time.
80    Queue is really empty if q->q.qlen == 0.
81    For complicated disciplines with multiple queues q->q is not
82    real packet queue, but however q->q.qlen must be valid.
83 
84    ---enqueue
85 
86    enqueue returns 0, if packet was enqueued successfully.
87    If packet (this one or another one) was dropped, it returns
88    not zero error code.
89    NET_XMIT_DROP 	- this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93    NET_XMIT_POLICED	- dropped by police.
94      Expected action: backoff or error to real-time apps.
95 
96    Auxiliary routines:
97 
98    ---requeue
99 
100    requeues once dequeued packet. It is used for non-standard or
101    just buggy devices, which can defer output even if dev->tbusy=0.
102 
103    ---reset
104 
105    returns qdisc to initial state: purge all buffers, clear all
106    timers, counters (except for statistics) etc.
107 
108    ---init
109 
110    initializes newly created qdisc.
111 
112    ---destroy
113 
114    destroys resources allocated by init and during lifetime of qdisc.
115 
116    ---change
117 
118    changes qdisc parameters.
119  */
120 
121 /* Protects list of registered TC modules. It is pure SMP lock. */
122 static DEFINE_RWLOCK(qdisc_mod_lock);
123 
124 
125 /************************************************
126  *	Queueing disciplines manipulation.	*
127  ************************************************/
128 
129 
130 /* The list of all installed queueing disciplines. */
131 
132 static struct Qdisc_ops *qdisc_base;
133 
134 /* Register/uregister queueing discipline */
135 
136 int register_qdisc(struct Qdisc_ops *qops)
137 {
138 	struct Qdisc_ops *q, **qp;
139 	int rc = -EEXIST;
140 
141 	write_lock(&qdisc_mod_lock);
142 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
143 		if (!strcmp(qops->id, q->id))
144 			goto out;
145 
146 	if (qops->enqueue == NULL)
147 		qops->enqueue = noop_qdisc_ops.enqueue;
148 	if (qops->requeue == NULL)
149 		qops->requeue = noop_qdisc_ops.requeue;
150 	if (qops->dequeue == NULL)
151 		qops->dequeue = noop_qdisc_ops.dequeue;
152 
153 	qops->next = NULL;
154 	*qp = qops;
155 	rc = 0;
156 out:
157 	write_unlock(&qdisc_mod_lock);
158 	return rc;
159 }
160 
161 int unregister_qdisc(struct Qdisc_ops *qops)
162 {
163 	struct Qdisc_ops *q, **qp;
164 	int err = -ENOENT;
165 
166 	write_lock(&qdisc_mod_lock);
167 	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
168 		if (q == qops)
169 			break;
170 	if (q) {
171 		*qp = q->next;
172 		q->next = NULL;
173 		err = 0;
174 	}
175 	write_unlock(&qdisc_mod_lock);
176 	return err;
177 }
178 
179 /* We know handle. Find qdisc among all qdisc's attached to device
180    (root qdisc, all its children, children of children etc.)
181  */
182 
183 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
184 {
185 	struct Qdisc *q;
186 
187 	list_for_each_entry(q, &dev->qdisc_list, list) {
188 		if (q->handle == handle)
189 			return q;
190 	}
191 	return NULL;
192 }
193 
194 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
195 {
196 	unsigned long cl;
197 	struct Qdisc *leaf;
198 	struct Qdisc_class_ops *cops = p->ops->cl_ops;
199 
200 	if (cops == NULL)
201 		return NULL;
202 	cl = cops->get(p, classid);
203 
204 	if (cl == 0)
205 		return NULL;
206 	leaf = cops->leaf(p, cl);
207 	cops->put(p, cl);
208 	return leaf;
209 }
210 
211 /* Find queueing discipline by name */
212 
213 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
214 {
215 	struct Qdisc_ops *q = NULL;
216 
217 	if (kind) {
218 		read_lock(&qdisc_mod_lock);
219 		for (q = qdisc_base; q; q = q->next) {
220 			if (rtattr_strcmp(kind, q->id) == 0) {
221 				if (!try_module_get(q->owner))
222 					q = NULL;
223 				break;
224 			}
225 		}
226 		read_unlock(&qdisc_mod_lock);
227 	}
228 	return q;
229 }
230 
231 static struct qdisc_rate_table *qdisc_rtab_list;
232 
233 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
234 {
235 	struct qdisc_rate_table *rtab;
236 
237 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
238 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
239 			rtab->refcnt++;
240 			return rtab;
241 		}
242 	}
243 
244 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
245 		return NULL;
246 
247 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
248 	if (rtab) {
249 		rtab->rate = *r;
250 		rtab->refcnt = 1;
251 		memcpy(rtab->data, RTA_DATA(tab), 1024);
252 		rtab->next = qdisc_rtab_list;
253 		qdisc_rtab_list = rtab;
254 	}
255 	return rtab;
256 }
257 
258 void qdisc_put_rtab(struct qdisc_rate_table *tab)
259 {
260 	struct qdisc_rate_table *rtab, **rtabp;
261 
262 	if (!tab || --tab->refcnt)
263 		return;
264 
265 	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
266 		if (rtab == tab) {
267 			*rtabp = rtab->next;
268 			kfree(rtab);
269 			return;
270 		}
271 	}
272 }
273 
274 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
275 {
276 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
277 						 timer);
278 	struct net_device *dev = wd->qdisc->dev;
279 
280 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
281 	smp_wmb();
282 	netif_schedule(dev);
283 
284 	return HRTIMER_NORESTART;
285 }
286 
287 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
288 {
289 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
290 	wd->timer.function = qdisc_watchdog;
291 	wd->qdisc = qdisc;
292 }
293 EXPORT_SYMBOL(qdisc_watchdog_init);
294 
295 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
296 {
297 	ktime_t time;
298 
299 	wd->qdisc->flags |= TCQ_F_THROTTLED;
300 	time = ktime_set(0, 0);
301 	time = ktime_add_ns(time, PSCHED_US2NS(expires));
302 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
303 }
304 EXPORT_SYMBOL(qdisc_watchdog_schedule);
305 
306 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
307 {
308 	hrtimer_cancel(&wd->timer);
309 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
310 }
311 EXPORT_SYMBOL(qdisc_watchdog_cancel);
312 
313 /* Allocate an unique handle from space managed by kernel */
314 
315 static u32 qdisc_alloc_handle(struct net_device *dev)
316 {
317 	int i = 0x10000;
318 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
319 
320 	do {
321 		autohandle += TC_H_MAKE(0x10000U, 0);
322 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
323 			autohandle = TC_H_MAKE(0x80000000U, 0);
324 	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
325 
326 	return i>0 ? autohandle : 0;
327 }
328 
329 /* Attach toplevel qdisc to device dev */
330 
331 static struct Qdisc *
332 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
333 {
334 	struct Qdisc *oqdisc;
335 
336 	if (dev->flags & IFF_UP)
337 		dev_deactivate(dev);
338 
339 	qdisc_lock_tree(dev);
340 	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
341 		oqdisc = dev->qdisc_ingress;
342 		/* Prune old scheduler */
343 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
344 			/* delete */
345 			qdisc_reset(oqdisc);
346 			dev->qdisc_ingress = NULL;
347 		} else {  /* new */
348 			dev->qdisc_ingress = qdisc;
349 		}
350 
351 	} else {
352 
353 		oqdisc = dev->qdisc_sleeping;
354 
355 		/* Prune old scheduler */
356 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
357 			qdisc_reset(oqdisc);
358 
359 		/* ... and graft new one */
360 		if (qdisc == NULL)
361 			qdisc = &noop_qdisc;
362 		dev->qdisc_sleeping = qdisc;
363 		dev->qdisc = &noop_qdisc;
364 	}
365 
366 	qdisc_unlock_tree(dev);
367 
368 	if (dev->flags & IFF_UP)
369 		dev_activate(dev);
370 
371 	return oqdisc;
372 }
373 
374 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
375 {
376 	struct Qdisc_class_ops *cops;
377 	unsigned long cl;
378 	u32 parentid;
379 
380 	if (n == 0)
381 		return;
382 	while ((parentid = sch->parent)) {
383 		sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
384 		if (sch == NULL) {
385 			WARN_ON(parentid != TC_H_ROOT);
386 			return;
387 		}
388 		cops = sch->ops->cl_ops;
389 		if (cops->qlen_notify) {
390 			cl = cops->get(sch, parentid);
391 			cops->qlen_notify(sch, cl);
392 			cops->put(sch, cl);
393 		}
394 		sch->q.qlen -= n;
395 	}
396 }
397 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
398 
399 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
400    to device "dev".
401 
402    Old qdisc is not destroyed but returned in *old.
403  */
404 
405 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
406 		       u32 classid,
407 		       struct Qdisc *new, struct Qdisc **old)
408 {
409 	int err = 0;
410 	struct Qdisc *q = *old;
411 
412 
413 	if (parent == NULL) {
414 		if (q && q->flags&TCQ_F_INGRESS) {
415 			*old = dev_graft_qdisc(dev, q);
416 		} else {
417 			*old = dev_graft_qdisc(dev, new);
418 		}
419 	} else {
420 		struct Qdisc_class_ops *cops = parent->ops->cl_ops;
421 
422 		err = -EINVAL;
423 
424 		if (cops) {
425 			unsigned long cl = cops->get(parent, classid);
426 			if (cl) {
427 				err = cops->graft(parent, cl, new, old);
428 				cops->put(parent, cl);
429 			}
430 		}
431 	}
432 	return err;
433 }
434 
435 /*
436    Allocate and initialize new qdisc.
437 
438    Parameters are passed via opt.
439  */
440 
441 static struct Qdisc *
442 qdisc_create(struct net_device *dev, u32 parent, u32 handle,
443 	   struct rtattr **tca, int *errp)
444 {
445 	int err;
446 	struct rtattr *kind = tca[TCA_KIND-1];
447 	struct Qdisc *sch;
448 	struct Qdisc_ops *ops;
449 
450 	ops = qdisc_lookup_ops(kind);
451 #ifdef CONFIG_KMOD
452 	if (ops == NULL && kind != NULL) {
453 		char name[IFNAMSIZ];
454 		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
455 			/* We dropped the RTNL semaphore in order to
456 			 * perform the module load.  So, even if we
457 			 * succeeded in loading the module we have to
458 			 * tell the caller to replay the request.  We
459 			 * indicate this using -EAGAIN.
460 			 * We replay the request because the device may
461 			 * go away in the mean time.
462 			 */
463 			rtnl_unlock();
464 			request_module("sch_%s", name);
465 			rtnl_lock();
466 			ops = qdisc_lookup_ops(kind);
467 			if (ops != NULL) {
468 				/* We will try again qdisc_lookup_ops,
469 				 * so don't keep a reference.
470 				 */
471 				module_put(ops->owner);
472 				err = -EAGAIN;
473 				goto err_out;
474 			}
475 		}
476 	}
477 #endif
478 
479 	err = -ENOENT;
480 	if (ops == NULL)
481 		goto err_out;
482 
483 	sch = qdisc_alloc(dev, ops);
484 	if (IS_ERR(sch)) {
485 		err = PTR_ERR(sch);
486 		goto err_out2;
487 	}
488 
489 	sch->parent = parent;
490 
491 	if (handle == TC_H_INGRESS) {
492 		sch->flags |= TCQ_F_INGRESS;
493 		sch->stats_lock = &dev->ingress_lock;
494 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
495 	} else {
496 		sch->stats_lock = &dev->queue_lock;
497 		if (handle == 0) {
498 			handle = qdisc_alloc_handle(dev);
499 			err = -ENOMEM;
500 			if (handle == 0)
501 				goto err_out3;
502 		}
503 	}
504 
505 	sch->handle = handle;
506 
507 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
508 		if (tca[TCA_RATE-1]) {
509 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
510 						sch->stats_lock,
511 						tca[TCA_RATE-1]);
512 			if (err) {
513 				/*
514 				 * Any broken qdiscs that would require
515 				 * a ops->reset() here? The qdisc was never
516 				 * in action so it shouldn't be necessary.
517 				 */
518 				if (ops->destroy)
519 					ops->destroy(sch);
520 				goto err_out3;
521 			}
522 		}
523 		qdisc_lock_tree(dev);
524 		list_add_tail(&sch->list, &dev->qdisc_list);
525 		qdisc_unlock_tree(dev);
526 
527 		return sch;
528 	}
529 err_out3:
530 	dev_put(dev);
531 	kfree((char *) sch - sch->padded);
532 err_out2:
533 	module_put(ops->owner);
534 err_out:
535 	*errp = err;
536 	return NULL;
537 }
538 
539 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
540 {
541 	if (tca[TCA_OPTIONS-1]) {
542 		int err;
543 
544 		if (sch->ops->change == NULL)
545 			return -EINVAL;
546 		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
547 		if (err)
548 			return err;
549 	}
550 	if (tca[TCA_RATE-1])
551 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
552 			sch->stats_lock, tca[TCA_RATE-1]);
553 	return 0;
554 }
555 
556 struct check_loop_arg
557 {
558 	struct qdisc_walker 	w;
559 	struct Qdisc		*p;
560 	int			depth;
561 };
562 
563 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
564 
565 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
566 {
567 	struct check_loop_arg	arg;
568 
569 	if (q->ops->cl_ops == NULL)
570 		return 0;
571 
572 	arg.w.stop = arg.w.skip = arg.w.count = 0;
573 	arg.w.fn = check_loop_fn;
574 	arg.depth = depth;
575 	arg.p = p;
576 	q->ops->cl_ops->walk(q, &arg.w);
577 	return arg.w.stop ? -ELOOP : 0;
578 }
579 
580 static int
581 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
582 {
583 	struct Qdisc *leaf;
584 	struct Qdisc_class_ops *cops = q->ops->cl_ops;
585 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
586 
587 	leaf = cops->leaf(q, cl);
588 	if (leaf) {
589 		if (leaf == arg->p || arg->depth > 7)
590 			return -ELOOP;
591 		return check_loop(leaf, arg->p, arg->depth + 1);
592 	}
593 	return 0;
594 }
595 
596 /*
597  * Delete/get qdisc.
598  */
599 
600 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
601 {
602 	struct tcmsg *tcm = NLMSG_DATA(n);
603 	struct rtattr **tca = arg;
604 	struct net_device *dev;
605 	u32 clid = tcm->tcm_parent;
606 	struct Qdisc *q = NULL;
607 	struct Qdisc *p = NULL;
608 	int err;
609 
610 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
611 		return -ENODEV;
612 
613 	if (clid) {
614 		if (clid != TC_H_ROOT) {
615 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
616 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
617 					return -ENOENT;
618 				q = qdisc_leaf(p, clid);
619 			} else { /* ingress */
620 				q = dev->qdisc_ingress;
621 			}
622 		} else {
623 			q = dev->qdisc_sleeping;
624 		}
625 		if (!q)
626 			return -ENOENT;
627 
628 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
629 			return -EINVAL;
630 	} else {
631 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
632 			return -ENOENT;
633 	}
634 
635 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
636 		return -EINVAL;
637 
638 	if (n->nlmsg_type == RTM_DELQDISC) {
639 		if (!clid)
640 			return -EINVAL;
641 		if (q->handle == 0)
642 			return -ENOENT;
643 		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
644 			return err;
645 		if (q) {
646 			qdisc_notify(skb, n, clid, q, NULL);
647 			qdisc_lock_tree(dev);
648 			qdisc_destroy(q);
649 			qdisc_unlock_tree(dev);
650 		}
651 	} else {
652 		qdisc_notify(skb, n, clid, NULL, q);
653 	}
654 	return 0;
655 }
656 
657 /*
658    Create/change qdisc.
659  */
660 
661 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
662 {
663 	struct tcmsg *tcm;
664 	struct rtattr **tca;
665 	struct net_device *dev;
666 	u32 clid;
667 	struct Qdisc *q, *p;
668 	int err;
669 
670 replay:
671 	/* Reinit, just in case something touches this. */
672 	tcm = NLMSG_DATA(n);
673 	tca = arg;
674 	clid = tcm->tcm_parent;
675 	q = p = NULL;
676 
677 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
678 		return -ENODEV;
679 
680 	if (clid) {
681 		if (clid != TC_H_ROOT) {
682 			if (clid != TC_H_INGRESS) {
683 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
684 					return -ENOENT;
685 				q = qdisc_leaf(p, clid);
686 			} else { /*ingress */
687 				q = dev->qdisc_ingress;
688 			}
689 		} else {
690 			q = dev->qdisc_sleeping;
691 		}
692 
693 		/* It may be default qdisc, ignore it */
694 		if (q && q->handle == 0)
695 			q = NULL;
696 
697 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
698 			if (tcm->tcm_handle) {
699 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
700 					return -EEXIST;
701 				if (TC_H_MIN(tcm->tcm_handle))
702 					return -EINVAL;
703 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
704 					goto create_n_graft;
705 				if (n->nlmsg_flags&NLM_F_EXCL)
706 					return -EEXIST;
707 				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
708 					return -EINVAL;
709 				if (q == p ||
710 				    (p && check_loop(q, p, 0)))
711 					return -ELOOP;
712 				atomic_inc(&q->refcnt);
713 				goto graft;
714 			} else {
715 				if (q == NULL)
716 					goto create_n_graft;
717 
718 				/* This magic test requires explanation.
719 				 *
720 				 *   We know, that some child q is already
721 				 *   attached to this parent and have choice:
722 				 *   either to change it or to create/graft new one.
723 				 *
724 				 *   1. We are allowed to create/graft only
725 				 *   if CREATE and REPLACE flags are set.
726 				 *
727 				 *   2. If EXCL is set, requestor wanted to say,
728 				 *   that qdisc tcm_handle is not expected
729 				 *   to exist, so that we choose create/graft too.
730 				 *
731 				 *   3. The last case is when no flags are set.
732 				 *   Alas, it is sort of hole in API, we
733 				 *   cannot decide what to do unambiguously.
734 				 *   For now we select create/graft, if
735 				 *   user gave KIND, which does not match existing.
736 				 */
737 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
738 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
739 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
740 				     (tca[TCA_KIND-1] &&
741 				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
742 					goto create_n_graft;
743 			}
744 		}
745 	} else {
746 		if (!tcm->tcm_handle)
747 			return -EINVAL;
748 		q = qdisc_lookup(dev, tcm->tcm_handle);
749 	}
750 
751 	/* Change qdisc parameters */
752 	if (q == NULL)
753 		return -ENOENT;
754 	if (n->nlmsg_flags&NLM_F_EXCL)
755 		return -EEXIST;
756 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
757 		return -EINVAL;
758 	err = qdisc_change(q, tca);
759 	if (err == 0)
760 		qdisc_notify(skb, n, clid, NULL, q);
761 	return err;
762 
763 create_n_graft:
764 	if (!(n->nlmsg_flags&NLM_F_CREATE))
765 		return -ENOENT;
766 	if (clid == TC_H_INGRESS)
767 		q = qdisc_create(dev, tcm->tcm_parent, tcm->tcm_parent,
768 				 tca, &err);
769 	else
770 		q = qdisc_create(dev, tcm->tcm_parent, tcm->tcm_handle,
771 				 tca, &err);
772 	if (q == NULL) {
773 		if (err == -EAGAIN)
774 			goto replay;
775 		return err;
776 	}
777 
778 graft:
779 	if (1) {
780 		struct Qdisc *old_q = NULL;
781 		err = qdisc_graft(dev, p, clid, q, &old_q);
782 		if (err) {
783 			if (q) {
784 				qdisc_lock_tree(dev);
785 				qdisc_destroy(q);
786 				qdisc_unlock_tree(dev);
787 			}
788 			return err;
789 		}
790 		qdisc_notify(skb, n, clid, old_q, q);
791 		if (old_q) {
792 			qdisc_lock_tree(dev);
793 			qdisc_destroy(old_q);
794 			qdisc_unlock_tree(dev);
795 		}
796 	}
797 	return 0;
798 }
799 
800 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
801 			 u32 pid, u32 seq, u16 flags, int event)
802 {
803 	struct tcmsg *tcm;
804 	struct nlmsghdr  *nlh;
805 	unsigned char *b = skb_tail_pointer(skb);
806 	struct gnet_dump d;
807 
808 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
809 	tcm = NLMSG_DATA(nlh);
810 	tcm->tcm_family = AF_UNSPEC;
811 	tcm->tcm__pad1 = 0;
812 	tcm->tcm__pad2 = 0;
813 	tcm->tcm_ifindex = q->dev->ifindex;
814 	tcm->tcm_parent = clid;
815 	tcm->tcm_handle = q->handle;
816 	tcm->tcm_info = atomic_read(&q->refcnt);
817 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
818 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
819 		goto rtattr_failure;
820 	q->qstats.qlen = q->q.qlen;
821 
822 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
823 			TCA_XSTATS, q->stats_lock, &d) < 0)
824 		goto rtattr_failure;
825 
826 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
827 		goto rtattr_failure;
828 
829 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
830 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
831 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
832 		goto rtattr_failure;
833 
834 	if (gnet_stats_finish_copy(&d) < 0)
835 		goto rtattr_failure;
836 
837 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
838 	return skb->len;
839 
840 nlmsg_failure:
841 rtattr_failure:
842 	nlmsg_trim(skb, b);
843 	return -1;
844 }
845 
846 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
847 			u32 clid, struct Qdisc *old, struct Qdisc *new)
848 {
849 	struct sk_buff *skb;
850 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
851 
852 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
853 	if (!skb)
854 		return -ENOBUFS;
855 
856 	if (old && old->handle) {
857 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
858 			goto err_out;
859 	}
860 	if (new) {
861 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
862 			goto err_out;
863 	}
864 
865 	if (skb->len)
866 		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
867 
868 err_out:
869 	kfree_skb(skb);
870 	return -EINVAL;
871 }
872 
873 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
874 {
875 	int idx, q_idx;
876 	int s_idx, s_q_idx;
877 	struct net_device *dev;
878 	struct Qdisc *q;
879 
880 	s_idx = cb->args[0];
881 	s_q_idx = q_idx = cb->args[1];
882 	read_lock(&dev_base_lock);
883 	idx = 0;
884 	for_each_netdev(&init_net, dev) {
885 		if (idx < s_idx)
886 			goto cont;
887 		if (idx > s_idx)
888 			s_q_idx = 0;
889 		q_idx = 0;
890 		list_for_each_entry(q, &dev->qdisc_list, list) {
891 			if (q_idx < s_q_idx) {
892 				q_idx++;
893 				continue;
894 			}
895 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
896 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
897 				goto done;
898 			q_idx++;
899 		}
900 cont:
901 		idx++;
902 	}
903 
904 done:
905 	read_unlock(&dev_base_lock);
906 
907 	cb->args[0] = idx;
908 	cb->args[1] = q_idx;
909 
910 	return skb->len;
911 }
912 
913 
914 
915 /************************************************
916  *	Traffic classes manipulation.		*
917  ************************************************/
918 
919 
920 
921 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
922 {
923 	struct tcmsg *tcm = NLMSG_DATA(n);
924 	struct rtattr **tca = arg;
925 	struct net_device *dev;
926 	struct Qdisc *q = NULL;
927 	struct Qdisc_class_ops *cops;
928 	unsigned long cl = 0;
929 	unsigned long new_cl;
930 	u32 pid = tcm->tcm_parent;
931 	u32 clid = tcm->tcm_handle;
932 	u32 qid = TC_H_MAJ(clid);
933 	int err;
934 
935 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
936 		return -ENODEV;
937 
938 	/*
939 	   parent == TC_H_UNSPEC - unspecified parent.
940 	   parent == TC_H_ROOT   - class is root, which has no parent.
941 	   parent == X:0	 - parent is root class.
942 	   parent == X:Y	 - parent is a node in hierarchy.
943 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
944 
945 	   handle == 0:0	 - generate handle from kernel pool.
946 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
947 	   handle == X:Y	 - clear.
948 	   handle == X:0	 - root class.
949 	 */
950 
951 	/* Step 1. Determine qdisc handle X:0 */
952 
953 	if (pid != TC_H_ROOT) {
954 		u32 qid1 = TC_H_MAJ(pid);
955 
956 		if (qid && qid1) {
957 			/* If both majors are known, they must be identical. */
958 			if (qid != qid1)
959 				return -EINVAL;
960 		} else if (qid1) {
961 			qid = qid1;
962 		} else if (qid == 0)
963 			qid = dev->qdisc_sleeping->handle;
964 
965 		/* Now qid is genuine qdisc handle consistent
966 		   both with parent and child.
967 
968 		   TC_H_MAJ(pid) still may be unspecified, complete it now.
969 		 */
970 		if (pid)
971 			pid = TC_H_MAKE(qid, pid);
972 	} else {
973 		if (qid == 0)
974 			qid = dev->qdisc_sleeping->handle;
975 	}
976 
977 	/* OK. Locate qdisc */
978 	if ((q = qdisc_lookup(dev, qid)) == NULL)
979 		return -ENOENT;
980 
981 	/* An check that it supports classes */
982 	cops = q->ops->cl_ops;
983 	if (cops == NULL)
984 		return -EINVAL;
985 
986 	/* Now try to get class */
987 	if (clid == 0) {
988 		if (pid == TC_H_ROOT)
989 			clid = qid;
990 	} else
991 		clid = TC_H_MAKE(qid, clid);
992 
993 	if (clid)
994 		cl = cops->get(q, clid);
995 
996 	if (cl == 0) {
997 		err = -ENOENT;
998 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
999 			goto out;
1000 	} else {
1001 		switch (n->nlmsg_type) {
1002 		case RTM_NEWTCLASS:
1003 			err = -EEXIST;
1004 			if (n->nlmsg_flags&NLM_F_EXCL)
1005 				goto out;
1006 			break;
1007 		case RTM_DELTCLASS:
1008 			err = cops->delete(q, cl);
1009 			if (err == 0)
1010 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1011 			goto out;
1012 		case RTM_GETTCLASS:
1013 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1014 			goto out;
1015 		default:
1016 			err = -EINVAL;
1017 			goto out;
1018 		}
1019 	}
1020 
1021 	new_cl = cl;
1022 	err = cops->change(q, clid, pid, tca, &new_cl);
1023 	if (err == 0)
1024 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1025 
1026 out:
1027 	if (cl)
1028 		cops->put(q, cl);
1029 
1030 	return err;
1031 }
1032 
1033 
1034 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1035 			  unsigned long cl,
1036 			  u32 pid, u32 seq, u16 flags, int event)
1037 {
1038 	struct tcmsg *tcm;
1039 	struct nlmsghdr  *nlh;
1040 	unsigned char *b = skb_tail_pointer(skb);
1041 	struct gnet_dump d;
1042 	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1043 
1044 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1045 	tcm = NLMSG_DATA(nlh);
1046 	tcm->tcm_family = AF_UNSPEC;
1047 	tcm->tcm_ifindex = q->dev->ifindex;
1048 	tcm->tcm_parent = q->handle;
1049 	tcm->tcm_handle = q->handle;
1050 	tcm->tcm_info = 0;
1051 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1052 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1053 		goto rtattr_failure;
1054 
1055 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1056 			TCA_XSTATS, q->stats_lock, &d) < 0)
1057 		goto rtattr_failure;
1058 
1059 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1060 		goto rtattr_failure;
1061 
1062 	if (gnet_stats_finish_copy(&d) < 0)
1063 		goto rtattr_failure;
1064 
1065 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1066 	return skb->len;
1067 
1068 nlmsg_failure:
1069 rtattr_failure:
1070 	nlmsg_trim(skb, b);
1071 	return -1;
1072 }
1073 
1074 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1075 			  struct Qdisc *q, unsigned long cl, int event)
1076 {
1077 	struct sk_buff *skb;
1078 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1079 
1080 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1081 	if (!skb)
1082 		return -ENOBUFS;
1083 
1084 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1085 		kfree_skb(skb);
1086 		return -EINVAL;
1087 	}
1088 
1089 	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1090 }
1091 
1092 struct qdisc_dump_args
1093 {
1094 	struct qdisc_walker w;
1095 	struct sk_buff *skb;
1096 	struct netlink_callback *cb;
1097 };
1098 
1099 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1100 {
1101 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1102 
1103 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1104 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1105 }
1106 
1107 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1108 {
1109 	int t;
1110 	int s_t;
1111 	struct net_device *dev;
1112 	struct Qdisc *q;
1113 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1114 	struct qdisc_dump_args arg;
1115 
1116 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1117 		return 0;
1118 	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1119 		return 0;
1120 
1121 	s_t = cb->args[0];
1122 	t = 0;
1123 
1124 	list_for_each_entry(q, &dev->qdisc_list, list) {
1125 		if (t < s_t || !q->ops->cl_ops ||
1126 		    (tcm->tcm_parent &&
1127 		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1128 			t++;
1129 			continue;
1130 		}
1131 		if (t > s_t)
1132 			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1133 		arg.w.fn = qdisc_class_dump;
1134 		arg.skb = skb;
1135 		arg.cb = cb;
1136 		arg.w.stop  = 0;
1137 		arg.w.skip = cb->args[1];
1138 		arg.w.count = 0;
1139 		q->ops->cl_ops->walk(q, &arg.w);
1140 		cb->args[1] = arg.w.count;
1141 		if (arg.w.stop)
1142 			break;
1143 		t++;
1144 	}
1145 
1146 	cb->args[0] = t;
1147 
1148 	dev_put(dev);
1149 	return skb->len;
1150 }
1151 
1152 /* Main classifier routine: scans classifier chain attached
1153    to this qdisc, (optionally) tests for protocol and asks
1154    specific classifiers.
1155  */
1156 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1157 		       struct tcf_result *res)
1158 {
1159 	__be16 protocol = skb->protocol;
1160 	int err = 0;
1161 
1162 	for (; tp; tp = tp->next) {
1163 		if ((tp->protocol == protocol ||
1164 		     tp->protocol == htons(ETH_P_ALL)) &&
1165 		    (err = tp->classify(skb, tp, res)) >= 0) {
1166 #ifdef CONFIG_NET_CLS_ACT
1167 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1168 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1169 #endif
1170 			return err;
1171 		}
1172 	}
1173 	return -1;
1174 }
1175 EXPORT_SYMBOL(tc_classify_compat);
1176 
1177 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1178 		struct tcf_result *res)
1179 {
1180 	int err = 0;
1181 	__be16 protocol;
1182 #ifdef CONFIG_NET_CLS_ACT
1183 	struct tcf_proto *otp = tp;
1184 reclassify:
1185 #endif
1186 	protocol = skb->protocol;
1187 
1188 	err = tc_classify_compat(skb, tp, res);
1189 #ifdef CONFIG_NET_CLS_ACT
1190 	if (err == TC_ACT_RECLASSIFY) {
1191 		u32 verd = G_TC_VERD(skb->tc_verd);
1192 		tp = otp;
1193 
1194 		if (verd++ >= MAX_REC_LOOP) {
1195 			printk("rule prio %u protocol %02x reclassify loop, "
1196 			       "packet dropped\n",
1197 			       tp->prio&0xffff, ntohs(tp->protocol));
1198 			return TC_ACT_SHOT;
1199 		}
1200 		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1201 		goto reclassify;
1202 	}
1203 #endif
1204 	return err;
1205 }
1206 EXPORT_SYMBOL(tc_classify);
1207 
1208 void tcf_destroy(struct tcf_proto *tp)
1209 {
1210 	tp->ops->destroy(tp);
1211 	module_put(tp->ops->owner);
1212 	kfree(tp);
1213 }
1214 
1215 void tcf_destroy_chain(struct tcf_proto *fl)
1216 {
1217 	struct tcf_proto *tp;
1218 
1219 	while ((tp = fl) != NULL) {
1220 		fl = tp->next;
1221 		tcf_destroy(tp);
1222 	}
1223 }
1224 EXPORT_SYMBOL(tcf_destroy_chain);
1225 
1226 #ifdef CONFIG_PROC_FS
1227 static int psched_show(struct seq_file *seq, void *v)
1228 {
1229 	struct timespec ts;
1230 
1231 	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1232 	seq_printf(seq, "%08x %08x %08x %08x\n",
1233 		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1234 		   1000000,
1235 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1236 
1237 	return 0;
1238 }
1239 
1240 static int psched_open(struct inode *inode, struct file *file)
1241 {
1242 	return single_open(file, psched_show, PDE(inode)->data);
1243 }
1244 
1245 static const struct file_operations psched_fops = {
1246 	.owner = THIS_MODULE,
1247 	.open = psched_open,
1248 	.read  = seq_read,
1249 	.llseek = seq_lseek,
1250 	.release = single_release,
1251 };
1252 #endif
1253 
1254 static int __init pktsched_init(void)
1255 {
1256 	register_qdisc(&pfifo_qdisc_ops);
1257 	register_qdisc(&bfifo_qdisc_ops);
1258 	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1259 
1260 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1261 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1262 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1263 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1264 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1265 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1266 
1267 	return 0;
1268 }
1269 
1270 subsys_initcall(pktsched_init);
1271 
1272 EXPORT_SYMBOL(qdisc_get_rtab);
1273 EXPORT_SYMBOL(qdisc_put_rtab);
1274 EXPORT_SYMBOL(register_qdisc);
1275 EXPORT_SYMBOL(unregister_qdisc);
1276