/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make the sanity
   checks and the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that the queue is empty, it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   the real packet queue, but q->q.qlen must nonetheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If the packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
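
/*
 * Example: a minimal "queue" style discipline honouring the contract
 * described above.  This is an illustrative sketch only (kept out of
 * the build); the "example" names are hypothetical.  enqueue returns
 * NET_XMIT_SUCCESS or NET_XMIT_DROP, and dequeue may return NULL while
 * q->q.qlen remains the authoritative emptiness test.
 */
#if 0
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
		return qdisc_enqueue_tail(skb, sch);	/* keeps q.qlen valid */

	return qdisc_drop(skb, sch);			/* NET_XMIT_DROP */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);	/* NULL only when really empty here */
}

static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.priv_size	= 0,
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};
#endif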

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
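
/*
 * Example: typical module wiring for the register/unregister pair
 * above.  Illustrative only; example_qdisc_ops is the hypothetical
 * ops table from the sketch near the top of this file.
 */
#if 0
static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);	/* -EEXIST on id clash */
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);		/* -ENOENT if absent */
}

module_init(example_module_init);
module_exit(example_module_exit);
#endif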

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
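
/*
 * Example: how a shaping qdisc typically consumes the rate table
 * helpers above from its ->init()/->change() path.  Illustrative
 * sketch modelled on what TBF/CBQ do; the names and the attribute
 * carrying the TC_RTAB_SIZE byte table are hypothetical.
 */
#if 0
static struct qdisc_rate_table *example_rtab;

static int example_set_rate(struct tc_ratespec *r, struct nlattr *rtab_attr)
{
	struct qdisc_rate_table *old, *rtab;

	rtab = qdisc_get_rtab(r, rtab_attr);	/* NULL on bad/missing table */
	if (rtab == NULL)
		return -EINVAL;

	old = example_rtab;
	example_rtab = rtab;
	qdisc_put_rtab(old);			/* release previous reference */
	return 0;
}
#endif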

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
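
/*
 * Worked example for the lookup above (illustrative numbers): with
 * overhead 0, cell_align 0, cell_log 6 and size_log 0, a 100 byte
 * packet maps to slot 100 >> 6 == 1, so pkt_len becomes
 * stab->data[1]; slots at or beyond tsize are extrapolated linearly
 * from the last table entry.
 */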

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		printk(KERN_WARNING
		       "%s: %s qdisc %X: is non-work-conserving?\n",
		       txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
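
/*
 * Example: the usual watchdog pattern in a rate limiting qdisc's
 * ->dequeue().  Illustrative sketch with hypothetical names; the
 * point is that NULL is returned while q.qlen != 0, and the watchdog
 * reschedules the queue at the next permitted transmission time.
 */
#if 0
static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	psched_time_t now = psched_get_time();

	if (now >= q->next_send_time)
		return qdisc_dequeue_head(sch);

	/* throttled: wake this queue up again when sending is allowed */
	qdisc_watchdog_schedule(&q->watchdog, q->next_send_time);
	return NULL;
}
#endif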

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
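
/*
 * Example: typical use of the class hash helpers by a classful qdisc
 * (hypothetical names).  Insertion happens under the tree lock and is
 * followed by an opportunistic grow, mirroring what e.g. HTB does.
 */
#if 0
static int example_class_add(struct Qdisc *sch, struct example_class *cl)
{
	struct example_sched_data *q = qdisc_priv(sch);

	sch_tree_lock(sch);
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);	/* rehashes under the lock */
	return 0;
}
#endif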

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(skb, n, classid, dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(sch->stab);
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from gen_replace_estimator
		   because the change cannot be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (!net_eq(net, &init_net))
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!net_eq(net, &init_net))
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requester wanted
				 *   to say that qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue, p,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return q->flags & TCQ_F_BUILTIN;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (!net_eq(net, &init_net))
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/


static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (!net_eq(net, &init_net))
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (!net_eq(net, &init_net))
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk(KERN_WARNING
			       "rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
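
/*
 * Example: how a classful qdisc's ->enqueue() typically drives the
 * classifier entry points above (hypothetical names, modelled on the
 * prio/htb classify helpers).
 */
#if 0
static struct example_class *example_classify(struct sk_buff *skb,
					      struct Qdisc *sch, int *qerr)
{
	struct example_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result < 0)
		return NULL;		/* no match: caller picks a default */
#ifdef CONFIG_NET_CLS_ACT
	switch (result) {
	case TC_ACT_QUEUED:
	case TC_ACT_STOLEN:
		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
	case TC_ACT_SHOT:
		return NULL;
	}
#endif
	return (struct example_class *)res.class;
}
#endif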

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);