xref: /openbmc/linux/net/sched/sch_api.c (revision e868d61272caa648214046a096e5a6bfc068dc8c)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/init.h>
31 #include <linux/proc_fs.h>
32 #include <linux/seq_file.h>
33 #include <linux/kmod.h>
34 #include <linux/list.h>
35 #include <linux/bitops.h>
36 #include <linux/hrtimer.h>
37 
38 #include <net/netlink.h>
39 #include <net/sock.h>
40 #include <net/pkt_sched.h>
41 
42 #include <asm/processor.h>
43 #include <asm/uaccess.h>
44 #include <asm/system.h>
45 
46 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 			struct Qdisc *old, struct Qdisc *new);
48 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 			 struct Qdisc *q, unsigned long cl, int event);
50 
51 /*
52 
53    Short review.
54    -------------
55 
56    This file consists of two interrelated parts:
57 
58    1. queueing disciplines manager frontend.
59    2. traffic classes manager frontend.
60 
61    Generally, queueing discipline ("qdisc") is a black box,
62    which is able to enqueue packets and to dequeue them (when
63    device is ready to send something) in order and at times
64    determined by algorithm hidden in it.
65 
66    qdisc's are divided to two categories:
67    - "queues", which have no internal structure visible from outside.
68    - "schedulers", which split all the packets to "traffic classes",
69      using "packet classifiers" (look at cls_api.c)
70 
71    In turn, classes may have child qdiscs (as rule, queues)
72    attached to them etc. etc. etc.
73 
74    The goal of the routines in this file is to translate
75    information supplied by user in the form of handles
76    to more intelligible for kernel form, to make some sanity
77    checks and part of work, which is common to all qdiscs
78    and to provide rtnetlink notifications.
79 
80    All real intelligent work is done inside qdisc modules.
81 
82 
83 
84    Every discipline has two major routines: enqueue and dequeue.
85 
86    ---dequeue
87 
88    dequeue usually returns a skb to send. It is allowed to return NULL,
89    but it does not mean that queue is empty, it just means that
90    discipline does not want to send anything this time.
91    Queue is really empty if q->q.qlen == 0.
92    For complicated disciplines with multiple queues q->q is not
93    real packet queue, but however q->q.qlen must be valid.
94 
95    ---enqueue
96 
97    enqueue returns 0, if packet was enqueued successfully.
98    If packet (this one or another one) was dropped, it returns
99    not zero error code.
100    NET_XMIT_DROP 	- this packet dropped
101      Expected action: do not backoff, but wait until queue will clear.
102    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
103      Expected action: backoff or ignore
104    NET_XMIT_POLICED	- dropped by police.
105      Expected action: backoff or error to real-time apps.
106 
107    Auxiliary routines:
108 
109    ---requeue
110 
111    requeues once dequeued packet. It is used for non-standard or
112    just buggy devices, which can defer output even if dev->tbusy=0.
113 
114    ---reset
115 
116    returns qdisc to initial state: purge all buffers, clear all
117    timers, counters (except for statistics) etc.
118 
119    ---init
120 
121    initializes newly created qdisc.
122 
123    ---destroy
124 
125    destroys resources allocated by init and during lifetime of qdisc.
126 
127    ---change
128 
129    changes qdisc parameters.
130  */
131 
132 /* Protects list of registered TC modules. It is pure SMP lock. */
133 static DEFINE_RWLOCK(qdisc_mod_lock);
134 
135 
136 /************************************************
137  *	Queueing disciplines manipulation.	*
138  ************************************************/
139 
140 
141 /* The list of all installed queueing disciplines. */
142 
143 static struct Qdisc_ops *qdisc_base;
144 
145 /* Register/unregister queueing discipline */
146 
147 int register_qdisc(struct Qdisc_ops *qops)
148 {
149 	struct Qdisc_ops *q, **qp;
150 	int rc = -EEXIST;
151 
152 	write_lock(&qdisc_mod_lock);
153 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 		if (!strcmp(qops->id, q->id))
155 			goto out;
156 
157 	if (qops->enqueue == NULL)
158 		qops->enqueue = noop_qdisc_ops.enqueue;
159 	if (qops->requeue == NULL)
160 		qops->requeue = noop_qdisc_ops.requeue;
161 	if (qops->dequeue == NULL)
162 		qops->dequeue = noop_qdisc_ops.dequeue;
163 
164 	qops->next = NULL;
165 	*qp = qops;
166 	rc = 0;
167 out:
168 	write_unlock(&qdisc_mod_lock);
169 	return rc;
170 }
171 
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174 	struct Qdisc_ops *q, **qp;
175 	int err = -ENOENT;
176 
177 	write_lock(&qdisc_mod_lock);
178 	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 		if (q == qops)
180 			break;
181 	if (q) {
182 		*qp = q->next;
183 		q->next = NULL;
184 		err = 0;
185 	}
186 	write_unlock(&qdisc_mod_lock);
187 	return err;
188 }
189 
190 /* We know handle. Find qdisc among all qdisc's attached to device
191    (root qdisc, all its children, children of children etc.)
192  */
193 
194 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
195 {
196 	struct Qdisc *q;
197 
198 	list_for_each_entry(q, &dev->qdisc_list, list) {
199 		if (q->handle == handle)
200 			return q;
201 	}
202 	return NULL;
203 }
204 
205 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
206 {
207 	unsigned long cl;
208 	struct Qdisc *leaf;
209 	struct Qdisc_class_ops *cops = p->ops->cl_ops;
210 
211 	if (cops == NULL)
212 		return NULL;
213 	cl = cops->get(p, classid);
214 
215 	if (cl == 0)
216 		return NULL;
217 	leaf = cops->leaf(p, cl);
218 	cops->put(p, cl);
219 	return leaf;
220 }
221 
222 /* Find queueing discipline by name */
223 
224 static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
225 {
226 	struct Qdisc_ops *q = NULL;
227 
228 	if (kind) {
229 		read_lock(&qdisc_mod_lock);
230 		for (q = qdisc_base; q; q = q->next) {
231 			if (rtattr_strcmp(kind, q->id) == 0) {
232 				if (!try_module_get(q->owner))
233 					q = NULL;
234 				break;
235 			}
236 		}
237 		read_unlock(&qdisc_mod_lock);
238 	}
239 	return q;
240 }
241 
242 static struct qdisc_rate_table *qdisc_rtab_list;
243 
244 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
245 {
246 	struct qdisc_rate_table *rtab;
247 
248 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
249 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
250 			rtab->refcnt++;
251 			return rtab;
252 		}
253 	}
254 
255 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
256 		return NULL;
257 
258 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
259 	if (rtab) {
260 		rtab->rate = *r;
261 		rtab->refcnt = 1;
262 		memcpy(rtab->data, RTA_DATA(tab), 1024);
263 		rtab->next = qdisc_rtab_list;
264 		qdisc_rtab_list = rtab;
265 	}
266 	return rtab;
267 }
268 
269 void qdisc_put_rtab(struct qdisc_rate_table *tab)
270 {
271 	struct qdisc_rate_table *rtab, **rtabp;
272 
273 	if (!tab || --tab->refcnt)
274 		return;
275 
276 	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
277 		if (rtab == tab) {
278 			*rtabp = rtab->next;
279 			kfree(rtab);
280 			return;
281 		}
282 	}
283 }
284 
285 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
286 {
287 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
288 						 timer);
289 	struct net_device *dev = wd->qdisc->dev;
290 
291 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
292 	smp_wmb();
293 	if (spin_trylock(&dev->queue_lock)) {
294 		qdisc_run(dev);
295 		spin_unlock(&dev->queue_lock);
296 	} else
297 		netif_schedule(dev);
298 
299 	return HRTIMER_NORESTART;
300 }
301 
302 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
303 {
304 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
305 	wd->timer.function = qdisc_watchdog;
306 	wd->qdisc = qdisc;
307 }
308 EXPORT_SYMBOL(qdisc_watchdog_init);
309 
310 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
311 {
312 	ktime_t time;
313 
314 	wd->qdisc->flags |= TCQ_F_THROTTLED;
315 	time = ktime_set(0, 0);
316 	time = ktime_add_ns(time, PSCHED_US2NS(expires));
317 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
318 }
319 EXPORT_SYMBOL(qdisc_watchdog_schedule);
320 
321 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
322 {
323 	hrtimer_cancel(&wd->timer);
324 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
325 }
326 EXPORT_SYMBOL(qdisc_watchdog_cancel);
327 
328 /* Allocate an unique handle from space managed by kernel */
329 
330 static u32 qdisc_alloc_handle(struct net_device *dev)
331 {
332 	int i = 0x10000;
333 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
334 
335 	do {
336 		autohandle += TC_H_MAKE(0x10000U, 0);
337 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
338 			autohandle = TC_H_MAKE(0x80000000U, 0);
339 	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
340 
341 	return i>0 ? autohandle : 0;
342 }
343 
344 /* Attach toplevel qdisc to device dev */
345 
346 static struct Qdisc *
347 dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
348 {
349 	struct Qdisc *oqdisc;
350 
351 	if (dev->flags & IFF_UP)
352 		dev_deactivate(dev);
353 
354 	qdisc_lock_tree(dev);
355 	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
356 		oqdisc = dev->qdisc_ingress;
357 		/* Prune old scheduler */
358 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
359 			/* delete */
360 			qdisc_reset(oqdisc);
361 			dev->qdisc_ingress = NULL;
362 		} else {  /* new */
363 			dev->qdisc_ingress = qdisc;
364 		}
365 
366 	} else {
367 
368 		oqdisc = dev->qdisc_sleeping;
369 
370 		/* Prune old scheduler */
371 		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
372 			qdisc_reset(oqdisc);
373 
374 		/* ... and graft new one */
375 		if (qdisc == NULL)
376 			qdisc = &noop_qdisc;
377 		dev->qdisc_sleeping = qdisc;
378 		dev->qdisc = &noop_qdisc;
379 	}
380 
381 	qdisc_unlock_tree(dev);
382 
383 	if (dev->flags & IFF_UP)
384 		dev_activate(dev);
385 
386 	return oqdisc;
387 }
388 
389 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
390 {
391 	struct Qdisc_class_ops *cops;
392 	unsigned long cl;
393 	u32 parentid;
394 
395 	if (n == 0)
396 		return;
397 	while ((parentid = sch->parent)) {
398 		sch = qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
399 		cops = sch->ops->cl_ops;
400 		if (cops->qlen_notify) {
401 			cl = cops->get(sch, parentid);
402 			cops->qlen_notify(sch, cl);
403 			cops->put(sch, cl);
404 		}
405 		sch->q.qlen -= n;
406 	}
407 }
408 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
409 
410 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
411    to device "dev".
412 
413    Old qdisc is not destroyed but returned in *old.
414  */
415 
416 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
417 		       u32 classid,
418 		       struct Qdisc *new, struct Qdisc **old)
419 {
420 	int err = 0;
421 	struct Qdisc *q = *old;
422 
423 
424 	if (parent == NULL) {
425 		if (q && q->flags&TCQ_F_INGRESS) {
426 			*old = dev_graft_qdisc(dev, q);
427 		} else {
428 			*old = dev_graft_qdisc(dev, new);
429 		}
430 	} else {
431 		struct Qdisc_class_ops *cops = parent->ops->cl_ops;
432 
433 		err = -EINVAL;
434 
435 		if (cops) {
436 			unsigned long cl = cops->get(parent, classid);
437 			if (cl) {
438 				err = cops->graft(parent, cl, new, old);
439 				if (new)
440 					new->parent = classid;
441 				cops->put(parent, cl);
442 			}
443 		}
444 	}
445 	return err;
446 }
447 
448 /*
449    Allocate and initialize new qdisc.
450 
451    Parameters are passed via opt.
452  */
453 
454 static struct Qdisc *
455 qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
456 {
457 	int err;
458 	struct rtattr *kind = tca[TCA_KIND-1];
459 	struct Qdisc *sch;
460 	struct Qdisc_ops *ops;
461 
462 	ops = qdisc_lookup_ops(kind);
463 #ifdef CONFIG_KMOD
464 	if (ops == NULL && kind != NULL) {
465 		char name[IFNAMSIZ];
466 		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
467 			/* We dropped the RTNL semaphore in order to
468 			 * perform the module load.  So, even if we
469 			 * succeeded in loading the module we have to
470 			 * tell the caller to replay the request.  We
471 			 * indicate this using -EAGAIN.
472 			 * We replay the request because the device may
473 			 * go away in the mean time.
474 			 */
475 			rtnl_unlock();
476 			request_module("sch_%s", name);
477 			rtnl_lock();
478 			ops = qdisc_lookup_ops(kind);
479 			if (ops != NULL) {
480 				/* We will try again qdisc_lookup_ops,
481 				 * so don't keep a reference.
482 				 */
483 				module_put(ops->owner);
484 				err = -EAGAIN;
485 				goto err_out;
486 			}
487 		}
488 	}
489 #endif
490 
491 	err = -ENOENT;
492 	if (ops == NULL)
493 		goto err_out;
494 
495 	sch = qdisc_alloc(dev, ops);
496 	if (IS_ERR(sch)) {
497 		err = PTR_ERR(sch);
498 		goto err_out2;
499 	}
500 
501 	if (handle == TC_H_INGRESS) {
502 		sch->flags |= TCQ_F_INGRESS;
503 		sch->stats_lock = &dev->ingress_lock;
504 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
505 	} else {
506 		sch->stats_lock = &dev->queue_lock;
507 		if (handle == 0) {
508 			handle = qdisc_alloc_handle(dev);
509 			err = -ENOMEM;
510 			if (handle == 0)
511 				goto err_out3;
512 		}
513 	}
514 
515 	sch->handle = handle;
516 
517 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
518 #ifdef CONFIG_NET_ESTIMATOR
519 		if (tca[TCA_RATE-1]) {
520 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
521 						sch->stats_lock,
522 						tca[TCA_RATE-1]);
523 			if (err) {
524 				/*
525 				 * Any broken qdiscs that would require
526 				 * a ops->reset() here? The qdisc was never
527 				 * in action so it shouldn't be necessary.
528 				 */
529 				if (ops->destroy)
530 					ops->destroy(sch);
531 				goto err_out3;
532 			}
533 		}
534 #endif
535 		qdisc_lock_tree(dev);
536 		list_add_tail(&sch->list, &dev->qdisc_list);
537 		qdisc_unlock_tree(dev);
538 
539 		return sch;
540 	}
541 err_out3:
542 	dev_put(dev);
543 	kfree((char *) sch - sch->padded);
544 err_out2:
545 	module_put(ops->owner);
546 err_out:
547 	*errp = err;
548 	return NULL;
549 }
550 
551 static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
552 {
553 	if (tca[TCA_OPTIONS-1]) {
554 		int err;
555 
556 		if (sch->ops->change == NULL)
557 			return -EINVAL;
558 		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
559 		if (err)
560 			return err;
561 	}
562 #ifdef CONFIG_NET_ESTIMATOR
563 	if (tca[TCA_RATE-1])
564 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
565 			sch->stats_lock, tca[TCA_RATE-1]);
566 #endif
567 	return 0;
568 }
569 
570 struct check_loop_arg
571 {
572 	struct qdisc_walker 	w;
573 	struct Qdisc		*p;
574 	int			depth;
575 };
576 
577 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
578 
579 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
580 {
581 	struct check_loop_arg	arg;
582 
583 	if (q->ops->cl_ops == NULL)
584 		return 0;
585 
586 	arg.w.stop = arg.w.skip = arg.w.count = 0;
587 	arg.w.fn = check_loop_fn;
588 	arg.depth = depth;
589 	arg.p = p;
590 	q->ops->cl_ops->walk(q, &arg.w);
591 	return arg.w.stop ? -ELOOP : 0;
592 }
593 
594 static int
595 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
596 {
597 	struct Qdisc *leaf;
598 	struct Qdisc_class_ops *cops = q->ops->cl_ops;
599 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
600 
601 	leaf = cops->leaf(q, cl);
602 	if (leaf) {
603 		if (leaf == arg->p || arg->depth > 7)
604 			return -ELOOP;
605 		return check_loop(leaf, arg->p, arg->depth + 1);
606 	}
607 	return 0;
608 }
609 
610 /*
611  * Delete/get qdisc.
612  */
613 
614 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
615 {
616 	struct tcmsg *tcm = NLMSG_DATA(n);
617 	struct rtattr **tca = arg;
618 	struct net_device *dev;
619 	u32 clid = tcm->tcm_parent;
620 	struct Qdisc *q = NULL;
621 	struct Qdisc *p = NULL;
622 	int err;
623 
624 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
625 		return -ENODEV;
626 
627 	if (clid) {
628 		if (clid != TC_H_ROOT) {
629 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
630 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
631 					return -ENOENT;
632 				q = qdisc_leaf(p, clid);
633 			} else { /* ingress */
634 				q = dev->qdisc_ingress;
635 			}
636 		} else {
637 			q = dev->qdisc_sleeping;
638 		}
639 		if (!q)
640 			return -ENOENT;
641 
642 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
643 			return -EINVAL;
644 	} else {
645 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
646 			return -ENOENT;
647 	}
648 
649 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
650 		return -EINVAL;
651 
652 	if (n->nlmsg_type == RTM_DELQDISC) {
653 		if (!clid)
654 			return -EINVAL;
655 		if (q->handle == 0)
656 			return -ENOENT;
657 		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
658 			return err;
659 		if (q) {
660 			qdisc_notify(skb, n, clid, q, NULL);
661 			qdisc_lock_tree(dev);
662 			qdisc_destroy(q);
663 			qdisc_unlock_tree(dev);
664 		}
665 	} else {
666 		qdisc_notify(skb, n, clid, NULL, q);
667 	}
668 	return 0;
669 }
670 
671 /*
672    Create/change qdisc.
673  */
674 
675 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
676 {
677 	struct tcmsg *tcm;
678 	struct rtattr **tca;
679 	struct net_device *dev;
680 	u32 clid;
681 	struct Qdisc *q, *p;
682 	int err;
683 
684 replay:
685 	/* Reinit, just in case something touches this. */
686 	tcm = NLMSG_DATA(n);
687 	tca = arg;
688 	clid = tcm->tcm_parent;
689 	q = p = NULL;
690 
691 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
692 		return -ENODEV;
693 
694 	if (clid) {
695 		if (clid != TC_H_ROOT) {
696 			if (clid != TC_H_INGRESS) {
697 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
698 					return -ENOENT;
699 				q = qdisc_leaf(p, clid);
700 			} else { /*ingress */
701 				q = dev->qdisc_ingress;
702 			}
703 		} else {
704 			q = dev->qdisc_sleeping;
705 		}
706 
707 		/* It may be default qdisc, ignore it */
708 		if (q && q->handle == 0)
709 			q = NULL;
710 
711 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
712 			if (tcm->tcm_handle) {
713 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
714 					return -EEXIST;
715 				if (TC_H_MIN(tcm->tcm_handle))
716 					return -EINVAL;
717 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
718 					goto create_n_graft;
719 				if (n->nlmsg_flags&NLM_F_EXCL)
720 					return -EEXIST;
721 				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
722 					return -EINVAL;
723 				if (q == p ||
724 				    (p && check_loop(q, p, 0)))
725 					return -ELOOP;
726 				atomic_inc(&q->refcnt);
727 				goto graft;
728 			} else {
729 				if (q == NULL)
730 					goto create_n_graft;
731 
732 				/* This magic test requires explanation.
733 				 *
734 				 *   We know, that some child q is already
735 				 *   attached to this parent and have choice:
736 				 *   either to change it or to create/graft new one.
737 				 *
738 				 *   1. We are allowed to create/graft only
739 				 *   if CREATE and REPLACE flags are set.
740 				 *
741 				 *   2. If EXCL is set, requestor wanted to say,
742 				 *   that qdisc tcm_handle is not expected
743 				 *   to exist, so that we choose create/graft too.
744 				 *
745 				 *   3. The last case is when no flags are set.
746 				 *   Alas, it is sort of hole in API, we
747 				 *   cannot decide what to do unambiguously.
748 				 *   For now we select create/graft, if
749 				 *   user gave KIND, which does not match existing.
750 				 */
751 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
752 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
753 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
754 				     (tca[TCA_KIND-1] &&
755 				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
756 					goto create_n_graft;
757 			}
758 		}
759 	} else {
760 		if (!tcm->tcm_handle)
761 			return -EINVAL;
762 		q = qdisc_lookup(dev, tcm->tcm_handle);
763 	}
764 
765 	/* Change qdisc parameters */
766 	if (q == NULL)
767 		return -ENOENT;
768 	if (n->nlmsg_flags&NLM_F_EXCL)
769 		return -EEXIST;
770 	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
771 		return -EINVAL;
772 	err = qdisc_change(q, tca);
773 	if (err == 0)
774 		qdisc_notify(skb, n, clid, NULL, q);
775 	return err;
776 
777 create_n_graft:
778 	if (!(n->nlmsg_flags&NLM_F_CREATE))
779 		return -ENOENT;
780 	if (clid == TC_H_INGRESS)
781 		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
782 	else
783 		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
784 	if (q == NULL) {
785 		if (err == -EAGAIN)
786 			goto replay;
787 		return err;
788 	}
789 
790 graft:
791 	if (1) {
792 		struct Qdisc *old_q = NULL;
793 		err = qdisc_graft(dev, p, clid, q, &old_q);
794 		if (err) {
795 			if (q) {
796 				qdisc_lock_tree(dev);
797 				qdisc_destroy(q);
798 				qdisc_unlock_tree(dev);
799 			}
800 			return err;
801 		}
802 		qdisc_notify(skb, n, clid, old_q, q);
803 		if (old_q) {
804 			qdisc_lock_tree(dev);
805 			qdisc_destroy(old_q);
806 			qdisc_unlock_tree(dev);
807 		}
808 	}
809 	return 0;
810 }
811 
812 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
813 			 u32 pid, u32 seq, u16 flags, int event)
814 {
815 	struct tcmsg *tcm;
816 	struct nlmsghdr  *nlh;
817 	unsigned char *b = skb_tail_pointer(skb);
818 	struct gnet_dump d;
819 
820 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
821 	tcm = NLMSG_DATA(nlh);
822 	tcm->tcm_family = AF_UNSPEC;
823 	tcm->tcm__pad1 = 0;
824 	tcm->tcm__pad2 = 0;
825 	tcm->tcm_ifindex = q->dev->ifindex;
826 	tcm->tcm_parent = clid;
827 	tcm->tcm_handle = q->handle;
828 	tcm->tcm_info = atomic_read(&q->refcnt);
829 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
830 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
831 		goto rtattr_failure;
832 	q->qstats.qlen = q->q.qlen;
833 
834 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
835 			TCA_XSTATS, q->stats_lock, &d) < 0)
836 		goto rtattr_failure;
837 
838 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
839 		goto rtattr_failure;
840 
841 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
842 #ifdef CONFIG_NET_ESTIMATOR
843 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
844 #endif
845 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
846 		goto rtattr_failure;
847 
848 	if (gnet_stats_finish_copy(&d) < 0)
849 		goto rtattr_failure;
850 
851 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
852 	return skb->len;
853 
854 nlmsg_failure:
855 rtattr_failure:
856 	nlmsg_trim(skb, b);
857 	return -1;
858 }
859 
860 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
861 			u32 clid, struct Qdisc *old, struct Qdisc *new)
862 {
863 	struct sk_buff *skb;
864 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
865 
866 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
867 	if (!skb)
868 		return -ENOBUFS;
869 
870 	if (old && old->handle) {
871 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
872 			goto err_out;
873 	}
874 	if (new) {
875 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
876 			goto err_out;
877 	}
878 
879 	if (skb->len)
880 		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
881 
882 err_out:
883 	kfree_skb(skb);
884 	return -EINVAL;
885 }
886 
887 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
888 {
889 	int idx, q_idx;
890 	int s_idx, s_q_idx;
891 	struct net_device *dev;
892 	struct Qdisc *q;
893 
894 	s_idx = cb->args[0];
895 	s_q_idx = q_idx = cb->args[1];
896 	read_lock(&dev_base_lock);
897 	idx = 0;
898 	for_each_netdev(dev) {
899 		if (idx < s_idx)
900 			goto cont;
901 		if (idx > s_idx)
902 			s_q_idx = 0;
903 		q_idx = 0;
904 		list_for_each_entry(q, &dev->qdisc_list, list) {
905 			if (q_idx < s_q_idx) {
906 				q_idx++;
907 				continue;
908 			}
909 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
910 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
911 				goto done;
912 			q_idx++;
913 		}
914 cont:
915 		idx++;
916 	}
917 
918 done:
919 	read_unlock(&dev_base_lock);
920 
921 	cb->args[0] = idx;
922 	cb->args[1] = q_idx;
923 
924 	return skb->len;
925 }
926 
927 
928 
929 /************************************************
930  *	Traffic classes manipulation.		*
931  ************************************************/
932 
933 
934 
935 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
936 {
937 	struct tcmsg *tcm = NLMSG_DATA(n);
938 	struct rtattr **tca = arg;
939 	struct net_device *dev;
940 	struct Qdisc *q = NULL;
941 	struct Qdisc_class_ops *cops;
942 	unsigned long cl = 0;
943 	unsigned long new_cl;
944 	u32 pid = tcm->tcm_parent;
945 	u32 clid = tcm->tcm_handle;
946 	u32 qid = TC_H_MAJ(clid);
947 	int err;
948 
949 	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
950 		return -ENODEV;
951 
952 	/*
953 	   parent == TC_H_UNSPEC - unspecified parent.
954 	   parent == TC_H_ROOT   - class is root, which has no parent.
955 	   parent == X:0	 - parent is root class.
956 	   parent == X:Y	 - parent is a node in hierarchy.
957 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
958 
959 	   handle == 0:0	 - generate handle from kernel pool.
960 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
961 	   handle == X:Y	 - clear.
962 	   handle == X:0	 - root class.
963 	 */
964 
965 	/* Step 1. Determine qdisc handle X:0 */
966 
967 	if (pid != TC_H_ROOT) {
968 		u32 qid1 = TC_H_MAJ(pid);
969 
970 		if (qid && qid1) {
971 			/* If both majors are known, they must be identical. */
972 			if (qid != qid1)
973 				return -EINVAL;
974 		} else if (qid1) {
975 			qid = qid1;
976 		} else if (qid == 0)
977 			qid = dev->qdisc_sleeping->handle;
978 
979 		/* Now qid is genuine qdisc handle consistent
980 		   both with parent and child.
981 
982 		   TC_H_MAJ(pid) still may be unspecified, complete it now.
983 		 */
984 		if (pid)
985 			pid = TC_H_MAKE(qid, pid);
986 	} else {
987 		if (qid == 0)
988 			qid = dev->qdisc_sleeping->handle;
989 	}
990 
991 	/* OK. Locate qdisc */
992 	if ((q = qdisc_lookup(dev, qid)) == NULL)
993 		return -ENOENT;
994 
995 	/* An check that it supports classes */
996 	cops = q->ops->cl_ops;
997 	if (cops == NULL)
998 		return -EINVAL;
999 
1000 	/* Now try to get class */
1001 	if (clid == 0) {
1002 		if (pid == TC_H_ROOT)
1003 			clid = qid;
1004 	} else
1005 		clid = TC_H_MAKE(qid, clid);
1006 
1007 	if (clid)
1008 		cl = cops->get(q, clid);
1009 
1010 	if (cl == 0) {
1011 		err = -ENOENT;
1012 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1013 			goto out;
1014 	} else {
1015 		switch (n->nlmsg_type) {
1016 		case RTM_NEWTCLASS:
1017 			err = -EEXIST;
1018 			if (n->nlmsg_flags&NLM_F_EXCL)
1019 				goto out;
1020 			break;
1021 		case RTM_DELTCLASS:
1022 			err = cops->delete(q, cl);
1023 			if (err == 0)
1024 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1025 			goto out;
1026 		case RTM_GETTCLASS:
1027 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1028 			goto out;
1029 		default:
1030 			err = -EINVAL;
1031 			goto out;
1032 		}
1033 	}
1034 
1035 	new_cl = cl;
1036 	err = cops->change(q, clid, pid, tca, &new_cl);
1037 	if (err == 0)
1038 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1039 
1040 out:
1041 	if (cl)
1042 		cops->put(q, cl);
1043 
1044 	return err;
1045 }
1046 
1047 
1048 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1049 			  unsigned long cl,
1050 			  u32 pid, u32 seq, u16 flags, int event)
1051 {
1052 	struct tcmsg *tcm;
1053 	struct nlmsghdr  *nlh;
1054 	unsigned char *b = skb_tail_pointer(skb);
1055 	struct gnet_dump d;
1056 	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1057 
1058 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1059 	tcm = NLMSG_DATA(nlh);
1060 	tcm->tcm_family = AF_UNSPEC;
1061 	tcm->tcm_ifindex = q->dev->ifindex;
1062 	tcm->tcm_parent = q->handle;
1063 	tcm->tcm_handle = q->handle;
1064 	tcm->tcm_info = 0;
1065 	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1066 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1067 		goto rtattr_failure;
1068 
1069 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1070 			TCA_XSTATS, q->stats_lock, &d) < 0)
1071 		goto rtattr_failure;
1072 
1073 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1074 		goto rtattr_failure;
1075 
1076 	if (gnet_stats_finish_copy(&d) < 0)
1077 		goto rtattr_failure;
1078 
1079 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1080 	return skb->len;
1081 
1082 nlmsg_failure:
1083 rtattr_failure:
1084 	nlmsg_trim(skb, b);
1085 	return -1;
1086 }
1087 
1088 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1089 			  struct Qdisc *q, unsigned long cl, int event)
1090 {
1091 	struct sk_buff *skb;
1092 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1093 
1094 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1095 	if (!skb)
1096 		return -ENOBUFS;
1097 
1098 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1099 		kfree_skb(skb);
1100 		return -EINVAL;
1101 	}
1102 
1103 	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1104 }
1105 
1106 struct qdisc_dump_args
1107 {
1108 	struct qdisc_walker w;
1109 	struct sk_buff *skb;
1110 	struct netlink_callback *cb;
1111 };
1112 
1113 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1114 {
1115 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1116 
1117 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1118 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1119 }
1120 
1121 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1122 {
1123 	int t;
1124 	int s_t;
1125 	struct net_device *dev;
1126 	struct Qdisc *q;
1127 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1128 	struct qdisc_dump_args arg;
1129 
1130 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1131 		return 0;
1132 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1133 		return 0;
1134 
1135 	s_t = cb->args[0];
1136 	t = 0;
1137 
1138 	list_for_each_entry(q, &dev->qdisc_list, list) {
1139 		if (t < s_t || !q->ops->cl_ops ||
1140 		    (tcm->tcm_parent &&
1141 		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1142 			t++;
1143 			continue;
1144 		}
1145 		if (t > s_t)
1146 			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1147 		arg.w.fn = qdisc_class_dump;
1148 		arg.skb = skb;
1149 		arg.cb = cb;
1150 		arg.w.stop  = 0;
1151 		arg.w.skip = cb->args[1];
1152 		arg.w.count = 0;
1153 		q->ops->cl_ops->walk(q, &arg.w);
1154 		cb->args[1] = arg.w.count;
1155 		if (arg.w.stop)
1156 			break;
1157 		t++;
1158 	}
1159 
1160 	cb->args[0] = t;
1161 
1162 	dev_put(dev);
1163 	return skb->len;
1164 }
1165 
1166 /* Main classifier routine: scans classifier chain attached
1167    to this qdisc, (optionally) tests for protocol and asks
1168    specific classifiers.
1169  */
1170 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1171 	struct tcf_result *res)
1172 {
1173 	int err = 0;
1174 	__be16 protocol = skb->protocol;
1175 #ifdef CONFIG_NET_CLS_ACT
1176 	struct tcf_proto *otp = tp;
1177 reclassify:
1178 #endif
1179 	protocol = skb->protocol;
1180 
1181 	for ( ; tp; tp = tp->next) {
1182 		if ((tp->protocol == protocol ||
1183 			tp->protocol == htons(ETH_P_ALL)) &&
1184 			(err = tp->classify(skb, tp, res)) >= 0) {
1185 #ifdef CONFIG_NET_CLS_ACT
1186 			if ( TC_ACT_RECLASSIFY == err) {
1187 				__u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1188 				tp = otp;
1189 
1190 				if (MAX_REC_LOOP < verd++) {
1191 					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1192 						tp->prio&0xffff, ntohs(tp->protocol));
1193 					return TC_ACT_SHOT;
1194 				}
1195 				skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1196 				goto reclassify;
1197 			} else {
1198 				if (skb->tc_verd)
1199 					skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1200 				return err;
1201 			}
1202 #else
1203 
1204 			return err;
1205 #endif
1206 		}
1207 
1208 	}
1209 	return -1;
1210 }
1211 
1212 void tcf_destroy(struct tcf_proto *tp)
1213 {
1214 	tp->ops->destroy(tp);
1215 	module_put(tp->ops->owner);
1216 	kfree(tp);
1217 }
1218 
1219 void tcf_destroy_chain(struct tcf_proto *fl)
1220 {
1221 	struct tcf_proto *tp;
1222 
1223 	while ((tp = fl) != NULL) {
1224 		fl = tp->next;
1225 		tcf_destroy(tp);
1226 	}
1227 }
1228 EXPORT_SYMBOL(tcf_destroy_chain);
1229 
1230 #ifdef CONFIG_PROC_FS
1231 static int psched_show(struct seq_file *seq, void *v)
1232 {
1233 	seq_printf(seq, "%08x %08x %08x %08x\n",
1234 		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1235 		   1000000,
1236 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(KTIME_MONOTONIC_RES));
1237 
1238 	return 0;
1239 }
1240 
1241 static int psched_open(struct inode *inode, struct file *file)
1242 {
1243 	return single_open(file, psched_show, PDE(inode)->data);
1244 }
1245 
1246 static const struct file_operations psched_fops = {
1247 	.owner = THIS_MODULE,
1248 	.open = psched_open,
1249 	.read  = seq_read,
1250 	.llseek = seq_lseek,
1251 	.release = single_release,
1252 };
1253 #endif
1254 
1255 static int __init pktsched_init(void)
1256 {
1257 	register_qdisc(&pfifo_qdisc_ops);
1258 	register_qdisc(&bfifo_qdisc_ops);
1259 	proc_net_fops_create("psched", 0, &psched_fops);
1260 
1261 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1262 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1263 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1264 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1265 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1266 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1267 
1268 	return 0;
1269 }
1270 
1271 subsys_initcall(pktsched_init);
1272 
1273 EXPORT_SYMBOL(qdisc_get_rtab);
1274 EXPORT_SYMBOL(qdisc_put_rtab);
1275 EXPORT_SYMBOL(register_qdisc);
1276 EXPORT_SYMBOL(unregister_qdisc);
1277 EXPORT_SYMBOL(tc_classify);
1278