1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 
32 #include <net/net_namespace.h>
33 #include <net/sock.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
36 
37 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 			struct Qdisc *old, struct Qdisc *new);
39 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 			 struct Qdisc *q, unsigned long cl, int event);
41 
42 /*
43 
44    Short review.
45    -------------
46 
47    This file consists of two interrelated parts:
48 
49    1. queueing disciplines manager frontend.
50    2. traffic classes manager frontend.
51 
52    Generally, a queueing discipline ("qdisc") is a black box
53    that can enqueue packets and dequeue them (when the device
54    is ready to send something) in an order and at times
55    determined by the algorithm hidden inside it.
56 
57    qdiscs are divided into two categories:
58    - "queues", which have no internal structure visible from outside.
59    - "schedulers", which split all the packets into "traffic classes",
60      using "packet classifiers" (see cls_api.c).
61 
62    In turn, classes may have child qdiscs (as a rule, queues)
63    attached to them, and so on.
64 
65    The goal of the routines in this file is to translate
66    the handle-based information supplied by the user into a
67    form the kernel can work with, to perform sanity checks
68    and the part of the work that is common to all qdiscs,
69    and to provide rtnetlink notifications.
70 
71    All the really intelligent work is done inside the qdisc modules.
72 
73 
74 
75    Every discipline has two major routines: enqueue and dequeue.
76 
77    ---dequeue
78 
79    dequeue usually returns an skb to send. It is allowed to return NULL,
80    but that does not mean the queue is empty; it just means that the
81    discipline does not want to send anything this time.
82    The queue is really empty only if q->q.qlen == 0.
83    For complicated disciplines with multiple queues, q->q is not the
84    real packet queue, but q->q.qlen must still be valid.
85 
86    ---enqueue
87 
88    enqueue returns 0 if the packet was enqueued successfully.
89    If a packet (this one or another one) was dropped, it returns
90    a non-zero error code:
91    NET_XMIT_DROP 	- this packet was dropped.
92      Expected action: do not back off, but wait until the queue clears.
93    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
94      Expected action: back off or ignore.
95    NET_XMIT_POLICED	- dropped by the policer.
96      Expected action: back off or report an error to real-time apps.
97 
98    Auxiliary routines:
99 
100    ---peek
101 
102    like dequeue, but without removing the packet from the queue.
103 
104    ---reset
105 
106    returns the qdisc to its initial state: purges all buffers, clears
107    all timers and counters (except statistics), etc.
108 
109    ---init
110 
111    initializes a newly created qdisc.
112 
113    ---destroy
114 
115    destroys resources allocated by init and during the lifetime of the qdisc.
116 
117    ---change
118 
119    changes qdisc parameters.
120  */
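
/* To make the enqueue/dequeue contract above concrete, here is a minimal
 * FIFO-style sketch (illustrative only; the example_* names and the
 * 128-packet limit are assumptions, not part of this file).  enqueue
 * reports success or a drop with the codes described above, and dequeue
 * may return NULL while sch->q.qlen stays the authoritative emptiness
 * test.
 */
#if 0	/* sketch, not compiled */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (likely(skb_queue_len(&sch->q) < 128))
		return qdisc_enqueue_tail(skb, sch);	/* NET_XMIT_SUCCESS */

	return qdisc_drop(skb, sch);			/* NET_XMIT_DROP */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* NULL only means "nothing to send right now". */
	return qdisc_dequeue_head(sch);
}
#endif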
121 
122 /* Protects the list of registered TC modules. It is a pure SMP lock. */
123 static DEFINE_RWLOCK(qdisc_mod_lock);
124 
125 
126 /************************************************
127  *	Queueing disciplines manipulation.	*
128  ************************************************/
129 
130 
131 /* The list of all installed queueing disciplines. */
132 
133 static struct Qdisc_ops *qdisc_base;
134 
135 /* Register/unregister a queueing discipline */
136 
137 int register_qdisc(struct Qdisc_ops *qops)
138 {
139 	struct Qdisc_ops *q, **qp;
140 	int rc = -EEXIST;
141 
142 	write_lock(&qdisc_mod_lock);
143 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 		if (!strcmp(qops->id, q->id))
145 			goto out;
146 
147 	if (qops->enqueue == NULL)
148 		qops->enqueue = noop_qdisc_ops.enqueue;
149 	if (qops->peek == NULL) {
150 		if (qops->dequeue == NULL) {
151 			qops->peek = noop_qdisc_ops.peek;
152 		} else {
153 			rc = -EINVAL;
154 			goto out;
155 		}
156 	}
157 	if (qops->dequeue == NULL)
158 		qops->dequeue = noop_qdisc_ops.dequeue;
159 
160 	qops->next = NULL;
161 	*qp = qops;
162 	rc = 0;
163 out:
164 	write_unlock(&qdisc_mod_lock);
165 	return rc;
166 }
167 EXPORT_SYMBOL(register_qdisc);
168 
169 int unregister_qdisc(struct Qdisc_ops *qops)
170 {
171 	struct Qdisc_ops *q, **qp;
172 	int err = -ENOENT;
173 
174 	write_lock(&qdisc_mod_lock);
175 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
176 		if (q == qops)
177 			break;
178 	if (q) {
179 		*qp = q->next;
180 		q->next = NULL;
181 		err = 0;
182 	}
183 	write_unlock(&qdisc_mod_lock);
184 	return err;
185 }
186 EXPORT_SYMBOL(unregister_qdisc);
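
/* A typical registration from a scheduler module might look like the
 * sketch below (example_qdisc_ops and its callbacks are assumed, not
 * defined here).  Note that register_qdisc() above rejects ops that
 * provide dequeue without peek, so a real discipline supplies one,
 * e.g. qdisc_peek_head().
 */
#if 0	/* sketch, not compiled */
static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif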
187 
188 /* We know the handle. Find the qdisc among all qdiscs attached to the
189    device (root qdisc, all its children, children of children, etc.).
190  */
191 
192 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193 {
194 	struct Qdisc *q;
195 
196 	if (!(root->flags & TCQ_F_BUILTIN) &&
197 	    root->handle == handle)
198 		return root;
199 
200 	list_for_each_entry(q, &root->list, list) {
201 		if (q->handle == handle)
202 			return q;
203 	}
204 	return NULL;
205 }
206 
207 static void qdisc_list_add(struct Qdisc *q)
208 {
209 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
210 		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
211 }
212 
213 void qdisc_list_del(struct Qdisc *q)
214 {
215 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
216 		list_del(&q->list);
217 }
218 EXPORT_SYMBOL(qdisc_list_del);
219 
220 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
221 {
222 	struct Qdisc *q;
223 
224 	q = qdisc_match_from_root(dev->qdisc, handle);
225 	if (q)
226 		goto out;
227 
228 	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
229 out:
230 	return q;
231 }
232 
233 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
234 {
235 	unsigned long cl;
236 	struct Qdisc *leaf;
237 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
238 
239 	if (cops == NULL)
240 		return NULL;
241 	cl = cops->get(p, classid);
242 
243 	if (cl == 0)
244 		return NULL;
245 	leaf = cops->leaf(p, cl);
246 	cops->put(p, cl);
247 	return leaf;
248 }
249 
250 /* Find queueing discipline by name */
251 
252 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
253 {
254 	struct Qdisc_ops *q = NULL;
255 
256 	if (kind) {
257 		read_lock(&qdisc_mod_lock);
258 		for (q = qdisc_base; q; q = q->next) {
259 			if (nla_strcmp(kind, q->id) == 0) {
260 				if (!try_module_get(q->owner))
261 					q = NULL;
262 				break;
263 			}
264 		}
265 		read_unlock(&qdisc_mod_lock);
266 	}
267 	return q;
268 }
269 
270 static struct qdisc_rate_table *qdisc_rtab_list;
271 
272 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
273 {
274 	struct qdisc_rate_table *rtab;
275 
276 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
277 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
278 			rtab->refcnt++;
279 			return rtab;
280 		}
281 	}
282 
283 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
284 	    nla_len(tab) != TC_RTAB_SIZE)
285 		return NULL;
286 
287 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
288 	if (rtab) {
289 		rtab->rate = *r;
290 		rtab->refcnt = 1;
291 		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
292 		rtab->next = qdisc_rtab_list;
293 		qdisc_rtab_list = rtab;
294 	}
295 	return rtab;
296 }
297 EXPORT_SYMBOL(qdisc_get_rtab);
298 
299 void qdisc_put_rtab(struct qdisc_rate_table *tab)
300 {
301 	struct qdisc_rate_table *rtab, **rtabp;
302 
303 	if (!tab || --tab->refcnt)
304 		return;
305 
306 	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
307 		if (rtab == tab) {
308 			*rtabp = rtab->next;
309 			kfree(rtab);
310 			return;
311 		}
312 	}
313 }
314 EXPORT_SYMBOL(qdisc_put_rtab);
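
/* Rate tables are shared: qdisc_get_rtab() either takes a reference on
 * an existing table with identical parameters or allocates a new one,
 * and qdisc_put_rtab() is the NULL-safe counterpart.  A discipline
 * therefore pairs them across ->change()/->destroy(), roughly as in
 * this sketch (struct example_sched and example_set_rate are assumed):
 */
#if 0	/* sketch, not compiled */
struct example_sched {
	struct qdisc_rate_table *rtab;
};

static int example_set_rate(struct Qdisc *sch, struct tc_ratespec *r,
			    struct nlattr *rtab_attr)
{
	struct example_sched *q = qdisc_priv(sch);
	struct qdisc_rate_table *rtab;

	rtab = qdisc_get_rtab(r, rtab_attr);
	if (rtab == NULL)
		return -EINVAL;

	qdisc_put_rtab(q->rtab);	/* drop the old table, if any */
	q->rtab = rtab;
	return 0;
}
#endif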
315 
316 static LIST_HEAD(qdisc_stab_list);
317 static DEFINE_SPINLOCK(qdisc_stab_lock);
318 
319 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
320 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
321 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
322 };
323 
324 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
325 {
326 	struct nlattr *tb[TCA_STAB_MAX + 1];
327 	struct qdisc_size_table *stab;
328 	struct tc_sizespec *s;
329 	unsigned int tsize = 0;
330 	u16 *tab = NULL;
331 	int err;
332 
333 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
334 	if (err < 0)
335 		return ERR_PTR(err);
336 	if (!tb[TCA_STAB_BASE])
337 		return ERR_PTR(-EINVAL);
338 
339 	s = nla_data(tb[TCA_STAB_BASE]);
340 
341 	if (s->tsize > 0) {
342 		if (!tb[TCA_STAB_DATA])
343 			return ERR_PTR(-EINVAL);
344 		tab = nla_data(tb[TCA_STAB_DATA]);
345 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
346 	}
347 
348 	if (!s || tsize != s->tsize || (!tab && tsize > 0))
349 		return ERR_PTR(-EINVAL);
350 
351 	spin_lock(&qdisc_stab_lock);
352 
353 	list_for_each_entry(stab, &qdisc_stab_list, list) {
354 		if (memcmp(&stab->szopts, s, sizeof(*s)))
355 			continue;
356 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
357 			continue;
358 		stab->refcnt++;
359 		spin_unlock(&qdisc_stab_lock);
360 		return stab;
361 	}
362 
363 	spin_unlock(&qdisc_stab_lock);
364 
365 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
366 	if (!stab)
367 		return ERR_PTR(-ENOMEM);
368 
369 	stab->refcnt = 1;
370 	stab->szopts = *s;
371 	if (tsize > 0)
372 		memcpy(stab->data, tab, tsize * sizeof(u16));
373 
374 	spin_lock(&qdisc_stab_lock);
375 	list_add_tail(&stab->list, &qdisc_stab_list);
376 	spin_unlock(&qdisc_stab_lock);
377 
378 	return stab;
379 }
380 
381 void qdisc_put_stab(struct qdisc_size_table *tab)
382 {
383 	if (!tab)
384 		return;
385 
386 	spin_lock(&qdisc_stab_lock);
387 
388 	if (--tab->refcnt == 0) {
389 		list_del(&tab->list);
390 		kfree(tab);
391 	}
392 
393 	spin_unlock(&qdisc_stab_lock);
394 }
395 EXPORT_SYMBOL(qdisc_put_stab);
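
/* Size tables follow the same sharing scheme as the rate tables above:
 * qdisc_get_stab() bumps the refcount of an identical table or allocates
 * a fresh one, and qdisc_put_stab() frees it on the last put.  The
 * attach points are sch->stab in qdisc_create() and qdisc_change()
 * below.
 */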
396 
397 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
398 {
399 	struct nlattr *nest;
400 
401 	nest = nla_nest_start(skb, TCA_STAB);
402 	if (nest == NULL)
403 		goto nla_put_failure;
404 	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
405 	nla_nest_end(skb, nest);
406 
407 	return skb->len;
408 
409 nla_put_failure:
410 	return -1;
411 }
412 
413 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
414 {
415 	int pkt_len, slot;
416 
417 	pkt_len = skb->len + stab->szopts.overhead;
418 	if (unlikely(!stab->szopts.tsize))
419 		goto out;
420 
421 	slot = pkt_len + stab->szopts.cell_align;
422 	if (unlikely(slot < 0))
423 		slot = 0;
424 
425 	slot >>= stab->szopts.cell_log;
426 	if (likely(slot < stab->szopts.tsize))
427 		pkt_len = stab->data[slot];
428 	else
429 		pkt_len = stab->data[stab->szopts.tsize - 1] *
430 				(slot / stab->szopts.tsize) +
431 				stab->data[slot % stab->szopts.tsize];
432 
433 	pkt_len <<= stab->szopts.size_log;
434 out:
435 	if (unlikely(pkt_len < 1))
436 		pkt_len = 1;
437 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
438 }
439 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
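
/* Worked example with assumed values: szopts.overhead = 24,
 * cell_align = -1, cell_log = 6, size_log = 6.  A 1000 byte skb gives
 * pkt_len = 1024 and slot = (1024 - 1) >> 6 = 15, so the length used
 * for scheduling becomes stab->data[15] << 6.  Slots beyond the table
 * end are extrapolated from the last entry, as coded above.
 */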
440 
441 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
442 {
443 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
444 		printk(KERN_WARNING
445 		       "%s: %s qdisc %X: is non-work-conserving?\n",
446 		       txt, qdisc->ops->id, qdisc->handle >> 16);
447 		qdisc->flags |= TCQ_F_WARN_NONWC;
448 	}
449 }
450 EXPORT_SYMBOL(qdisc_warn_nonwc);
451 
452 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
453 {
454 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
455 						 timer);
456 
457 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
458 	__netif_schedule(qdisc_root(wd->qdisc));
459 
460 	return HRTIMER_NORESTART;
461 }
462 
463 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
464 {
465 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
466 	wd->timer.function = qdisc_watchdog;
467 	wd->qdisc = qdisc;
468 }
469 EXPORT_SYMBOL(qdisc_watchdog_init);
470 
471 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
472 {
473 	ktime_t time;
474 
475 	if (test_bit(__QDISC_STATE_DEACTIVATED,
476 		     &qdisc_root_sleeping(wd->qdisc)->state))
477 		return;
478 
479 	wd->qdisc->flags |= TCQ_F_THROTTLED;
480 	time = ktime_set(0, 0);
481 	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
482 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
483 }
484 EXPORT_SYMBOL(qdisc_watchdog_schedule);
485 
486 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
487 {
488 	hrtimer_cancel(&wd->timer);
489 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
490 }
491 EXPORT_SYMBOL(qdisc_watchdog_cancel);
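
/* A shaping discipline typically arms the watchdog from its dequeue
 * path when the head packet is not yet eligible to be sent; the timer
 * then clears TCQ_F_THROTTLED and reschedules the root qdisc (see
 * qdisc_watchdog() above).  Sketch, assuming a struct example_sched
 * that carries a watchdog and a precomputed next_send time:
 */
#if 0	/* sketch, not compiled */
static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_sched *q = qdisc_priv(sch);

	if (sch->q.qlen && psched_get_time() < q->next_send) {
		qdisc_watchdog_schedule(&q->watchdog, q->next_send);
		return NULL;	/* throttled, not empty */
	}
	return qdisc_dequeue_head(sch);
}
#endif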
492 
493 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
494 {
495 	unsigned int size = n * sizeof(struct hlist_head), i;
496 	struct hlist_head *h;
497 
498 	if (size <= PAGE_SIZE)
499 		h = kmalloc(size, GFP_KERNEL);
500 	else
501 		h = (struct hlist_head *)
502 			__get_free_pages(GFP_KERNEL, get_order(size));
503 
504 	if (h != NULL) {
505 		for (i = 0; i < n; i++)
506 			INIT_HLIST_HEAD(&h[i]);
507 	}
508 	return h;
509 }
510 
511 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
512 {
513 	unsigned int size = n * sizeof(struct hlist_head);
514 
515 	if (size <= PAGE_SIZE)
516 		kfree(h);
517 	else
518 		free_pages((unsigned long)h, get_order(size));
519 }
520 
521 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
522 {
523 	struct Qdisc_class_common *cl;
524 	struct hlist_node *n, *next;
525 	struct hlist_head *nhash, *ohash;
526 	unsigned int nsize, nmask, osize;
527 	unsigned int i, h;
528 
529 	/* Rehash when load factor exceeds 0.75 */
530 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
531 		return;
532 	nsize = clhash->hashsize * 2;
533 	nmask = nsize - 1;
534 	nhash = qdisc_class_hash_alloc(nsize);
535 	if (nhash == NULL)
536 		return;
537 
538 	ohash = clhash->hash;
539 	osize = clhash->hashsize;
540 
541 	sch_tree_lock(sch);
542 	for (i = 0; i < osize; i++) {
543 		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
544 			h = qdisc_class_hash(cl->classid, nmask);
545 			hlist_add_head(&cl->hnode, &nhash[h]);
546 		}
547 	}
548 	clhash->hash     = nhash;
549 	clhash->hashsize = nsize;
550 	clhash->hashmask = nmask;
551 	sch_tree_unlock(sch);
552 
553 	qdisc_class_hash_free(ohash, osize);
554 }
555 EXPORT_SYMBOL(qdisc_class_hash_grow);
556 
557 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
558 {
559 	unsigned int size = 4;
560 
561 	clhash->hash = qdisc_class_hash_alloc(size);
562 	if (clhash->hash == NULL)
563 		return -ENOMEM;
564 	clhash->hashsize  = size;
565 	clhash->hashmask  = size - 1;
566 	clhash->hashelems = 0;
567 	return 0;
568 }
569 EXPORT_SYMBOL(qdisc_class_hash_init);
570 
571 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
572 {
573 	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
574 }
575 EXPORT_SYMBOL(qdisc_class_hash_destroy);
576 
577 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
578 			     struct Qdisc_class_common *cl)
579 {
580 	unsigned int h;
581 
582 	INIT_HLIST_NODE(&cl->hnode);
583 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
584 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
585 	clhash->hashelems++;
586 }
587 EXPORT_SYMBOL(qdisc_class_hash_insert);
588 
589 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
590 			     struct Qdisc_class_common *cl)
591 {
592 	hlist_del(&cl->hnode);
593 	clhash->hashelems--;
594 }
595 EXPORT_SYMBOL(qdisc_class_hash_remove);
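
/* The class hash above is the standard way for a classful qdisc to map
 * a classid to its class structure: embed Qdisc_class_common in the
 * class, insert/remove it around class creation/deletion, and call
 * qdisc_class_hash_grow() after inserts.  A lookup helper then looks
 * like this sketch (struct example_class, example_find and the clhash
 * member of struct example_sched are assumed):
 */
#if 0	/* sketch, not compiled */
struct example_class {
	struct Qdisc_class_common common;
	/* ... per-class scheduling state ... */
};

static struct example_class *example_find(struct Qdisc *sch, u32 classid)
{
	struct example_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	return clc ? container_of(clc, struct example_class, common) : NULL;
}
#endif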
596 
597 /* Allocate a unique handle from the space managed by the kernel */
598 
599 static u32 qdisc_alloc_handle(struct net_device *dev)
600 {
601 	int i = 0x10000;
602 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
603 
604 	do {
605 		autohandle += TC_H_MAKE(0x10000U, 0);
606 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
607 			autohandle = TC_H_MAKE(0x80000000U, 0);
608 	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
609 
610 	return i > 0 ? autohandle : 0;
611 }
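
/* Example: successive automatic allocations hand out the majors 8001:,
 * 8002:, ... (0x80010000, 0x80020000, ...), skip handles that are
 * already in use, and wrap back to 0x80000000 before reaching
 * TC_H_MAKE(TC_H_ROOT, 0).
 */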
612 
613 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
614 {
615 	const struct Qdisc_class_ops *cops;
616 	unsigned long cl;
617 	u32 parentid;
618 
619 	if (n == 0)
620 		return;
621 	while ((parentid = sch->parent)) {
622 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
623 			return;
624 
625 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
626 		if (sch == NULL) {
627 			WARN_ON(parentid != TC_H_ROOT);
628 			return;
629 		}
630 		cops = sch->ops->cl_ops;
631 		if (cops->qlen_notify) {
632 			cl = cops->get(sch, parentid);
633 			cops->qlen_notify(sch, cl);
634 			cops->put(sch, cl);
635 		}
636 		sch->q.qlen -= n;
637 	}
638 }
639 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
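
/* A qdisc that drops packets outside the enqueue/dequeue path (for
 * instance when ->change() shrinks its limit) must report the count
 * upwards, or the cached q.qlen of its ancestors would drift.  Sketch
 * (example_shrink is an assumed helper):
 */
#if 0	/* sketch, not compiled */
static void example_shrink(struct Qdisc *sch, unsigned int new_limit)
{
	unsigned int dropped = 0;

	while (sch->q.qlen > new_limit) {
		kfree_skb(qdisc_dequeue_head(sch));
		dropped++;
	}
	qdisc_tree_decrease_qlen(sch, dropped);
}
#endif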
640 
641 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
642 			       struct Qdisc *old, struct Qdisc *new)
643 {
644 	if (new || old)
645 		qdisc_notify(skb, n, clid, old, new);
646 
647 	if (old)
648 		qdisc_destroy(old);
649 }
650 
651 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
652  * to device "dev".
653  *
654  * When appropriate, send a netlink notification using "skb"
655  * and "n".
656  *
657  * On success, destroy the old qdisc.
658  */
659 
660 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
661 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
662 		       struct Qdisc *new, struct Qdisc *old)
663 {
664 	struct Qdisc *q = old;
665 	int err = 0;
666 
667 	if (parent == NULL) {
668 		unsigned int i, num_q, ingress;
669 
670 		ingress = 0;
671 		num_q = dev->num_tx_queues;
672 		if ((q && q->flags & TCQ_F_INGRESS) ||
673 		    (new && new->flags & TCQ_F_INGRESS)) {
674 			num_q = 1;
675 			ingress = 1;
676 		}
677 
678 		if (dev->flags & IFF_UP)
679 			dev_deactivate(dev);
680 
681 		if (new && new->ops->attach) {
682 			new->ops->attach(new);
683 			num_q = 0;
684 		}
685 
686 		for (i = 0; i < num_q; i++) {
687 			struct netdev_queue *dev_queue = &dev->rx_queue;
688 
689 			if (!ingress)
690 				dev_queue = netdev_get_tx_queue(dev, i);
691 
692 			old = dev_graft_qdisc(dev_queue, new);
693 			if (new && i > 0)
694 				atomic_inc(&new->refcnt);
695 
696 			if (!ingress)
697 				qdisc_destroy(old);
698 		}
699 
700 		if (!ingress) {
701 			notify_and_destroy(skb, n, classid, dev->qdisc, new);
702 			if (new && !new->ops->attach)
703 				atomic_inc(&new->refcnt);
704 			dev->qdisc = new ? : &noop_qdisc;
705 		} else {
706 			notify_and_destroy(skb, n, classid, old, new);
707 		}
708 
709 		if (dev->flags & IFF_UP)
710 			dev_activate(dev);
711 	} else {
712 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
713 
714 		err = -EOPNOTSUPP;
715 		if (cops && cops->graft) {
716 			unsigned long cl = cops->get(parent, classid);
717 			if (cl) {
718 				err = cops->graft(parent, cl, new, &old);
719 				cops->put(parent, cl);
720 			} else
721 				err = -ENOENT;
722 		}
723 		if (!err)
724 			notify_and_destroy(skb, n, classid, old, new);
725 	}
726 	return err;
727 }
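
/* Concretely: a request like "tc qdisc add dev eth0 parent 1:4 handle
 * 40: sfq" (an assumed illustration) arrives here with "parent" being
 * qdisc 1:0, classid 0x00010004 (1:4) and "new" the freshly created
 * sfq; cops->graft() swaps it into the class and hands back the
 * displaced qdisc in "old".
 */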
728 
729 /* lockdep annotation is needed for ingress; egress gets it only for name */
730 static struct lock_class_key qdisc_tx_lock;
731 static struct lock_class_key qdisc_rx_lock;
732 
733 /*
734    Allocate and initialize a new qdisc.
735 
736    Parameters are passed via the tca attribute array.
737  */
738 
739 static struct Qdisc *
740 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
741 	     struct Qdisc *p, u32 parent, u32 handle,
742 	     struct nlattr **tca, int *errp)
743 {
744 	int err;
745 	struct nlattr *kind = tca[TCA_KIND];
746 	struct Qdisc *sch;
747 	struct Qdisc_ops *ops;
748 	struct qdisc_size_table *stab;
749 
750 	ops = qdisc_lookup_ops(kind);
751 #ifdef CONFIG_MODULES
752 	if (ops == NULL && kind != NULL) {
753 		char name[IFNAMSIZ];
754 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
755 			/* We dropped the RTNL semaphore in order to
756 			 * perform the module load.  So, even if we
757 			 * succeeded in loading the module we have to
758 			 * tell the caller to replay the request.  We
759 			 * indicate this using -EAGAIN.
760 			 * We replay the request because the device may
761 			 * go away in the meantime.
762 			 */
763 			rtnl_unlock();
764 			request_module("sch_%s", name);
765 			rtnl_lock();
766 			ops = qdisc_lookup_ops(kind);
767 			if (ops != NULL) {
768 			/* We will call qdisc_lookup_ops() again on replay,
769 			 * so don't keep a reference here.
770 				 */
771 				module_put(ops->owner);
772 				err = -EAGAIN;
773 				goto err_out;
774 			}
775 		}
776 	}
777 #endif
778 
779 	err = -ENOENT;
780 	if (ops == NULL)
781 		goto err_out;
782 
783 	sch = qdisc_alloc(dev_queue, ops);
784 	if (IS_ERR(sch)) {
785 		err = PTR_ERR(sch);
786 		goto err_out2;
787 	}
788 
789 	sch->parent = parent;
790 
791 	if (handle == TC_H_INGRESS) {
792 		sch->flags |= TCQ_F_INGRESS;
793 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
794 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
795 	} else {
796 		if (handle == 0) {
797 			handle = qdisc_alloc_handle(dev);
798 			err = -ENOMEM;
799 			if (handle == 0)
800 				goto err_out3;
801 		}
802 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
803 	}
804 
805 	sch->handle = handle;
806 
807 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
808 		if (tca[TCA_STAB]) {
809 			stab = qdisc_get_stab(tca[TCA_STAB]);
810 			if (IS_ERR(stab)) {
811 				err = PTR_ERR(stab);
812 				goto err_out4;
813 			}
814 			sch->stab = stab;
815 		}
816 		if (tca[TCA_RATE]) {
817 			spinlock_t *root_lock;
818 
819 			err = -EOPNOTSUPP;
820 			if (sch->flags & TCQ_F_MQROOT)
821 				goto err_out4;
822 
823 			if ((sch->parent != TC_H_ROOT) &&
824 			    !(sch->flags & TCQ_F_INGRESS) &&
825 			    (!p || !(p->flags & TCQ_F_MQROOT)))
826 				root_lock = qdisc_root_sleeping_lock(sch);
827 			else
828 				root_lock = qdisc_lock(sch);
829 
830 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
831 						root_lock, tca[TCA_RATE]);
832 			if (err)
833 				goto err_out4;
834 		}
835 
836 		qdisc_list_add(sch);
837 
838 		return sch;
839 	}
840 err_out3:
841 	dev_put(dev);
842 	kfree((char *) sch - sch->padded);
843 err_out2:
844 	module_put(ops->owner);
845 err_out:
846 	*errp = err;
847 	return NULL;
848 
849 err_out4:
850 	/*
851 	 * Any broken qdiscs that would require a ops->reset() here?
852 	 * The qdisc was never in action so it shouldn't be necessary.
853 	 */
854 	qdisc_put_stab(sch->stab);
855 	if (ops->destroy)
856 		ops->destroy(sch);
857 	goto err_out3;
858 }
859 
860 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
861 {
862 	struct qdisc_size_table *stab = NULL;
863 	int err = 0;
864 
865 	if (tca[TCA_OPTIONS]) {
866 		if (sch->ops->change == NULL)
867 			return -EINVAL;
868 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
869 		if (err)
870 			return err;
871 	}
872 
873 	if (tca[TCA_STAB]) {
874 		stab = qdisc_get_stab(tca[TCA_STAB]);
875 		if (IS_ERR(stab))
876 			return PTR_ERR(stab);
877 	}
878 
879 	qdisc_put_stab(sch->stab);
880 	sch->stab = stab;
881 
882 	if (tca[TCA_RATE]) {
883 		/* NB: errors from gen_replace_estimator() are ignored
884 		   because the change cannot be undone. */
885 		if (sch->flags & TCQ_F_MQROOT)
886 			goto out;
887 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
888 					    qdisc_root_sleeping_lock(sch),
889 					    tca[TCA_RATE]);
890 	}
891 out:
892 	return 0;
893 }
894 
895 struct check_loop_arg
896 {
897 	struct qdisc_walker 	w;
898 	struct Qdisc		*p;
899 	int			depth;
900 };
901 
902 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
903 
904 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
905 {
906 	struct check_loop_arg	arg;
907 
908 	if (q->ops->cl_ops == NULL)
909 		return 0;
910 
911 	arg.w.stop = arg.w.skip = arg.w.count = 0;
912 	arg.w.fn = check_loop_fn;
913 	arg.depth = depth;
914 	arg.p = p;
915 	q->ops->cl_ops->walk(q, &arg.w);
916 	return arg.w.stop ? -ELOOP : 0;
917 }
918 
919 static int
920 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
921 {
922 	struct Qdisc *leaf;
923 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
924 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
925 
926 	leaf = cops->leaf(q, cl);
927 	if (leaf) {
928 		if (leaf == arg->p || arg->depth > 7)
929 			return -ELOOP;
930 		return check_loop(leaf, arg->p, arg->depth + 1);
931 	}
932 	return 0;
933 }
934 
935 /*
936  * Delete/get qdisc.
937  */
938 
939 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
940 {
941 	struct net *net = sock_net(skb->sk);
942 	struct tcmsg *tcm = NLMSG_DATA(n);
943 	struct nlattr *tca[TCA_MAX + 1];
944 	struct net_device *dev;
945 	u32 clid = tcm->tcm_parent;
946 	struct Qdisc *q = NULL;
947 	struct Qdisc *p = NULL;
948 	int err;
949 
950 	if (net != &init_net)
951 		return -EINVAL;
952 
953 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
954 		return -ENODEV;
955 
956 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
957 	if (err < 0)
958 		return err;
959 
960 	if (clid) {
961 		if (clid != TC_H_ROOT) {
962 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
963 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
964 					return -ENOENT;
965 				q = qdisc_leaf(p, clid);
966 			} else { /* ingress */
967 				q = dev->rx_queue.qdisc_sleeping;
968 			}
969 		} else {
970 			q = dev->qdisc;
971 		}
972 		if (!q)
973 			return -ENOENT;
974 
975 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
976 			return -EINVAL;
977 	} else {
978 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
979 			return -ENOENT;
980 	}
981 
982 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
983 		return -EINVAL;
984 
985 	if (n->nlmsg_type == RTM_DELQDISC) {
986 		if (!clid)
987 			return -EINVAL;
988 		if (q->handle == 0)
989 			return -ENOENT;
990 		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
991 			return err;
992 	} else {
993 		qdisc_notify(skb, n, clid, NULL, q);
994 	}
995 	return 0;
996 }
997 
998 /*
999    Create/change qdisc.
1000  */
1001 
1002 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1003 {
1004 	struct net *net = sock_net(skb->sk);
1005 	struct tcmsg *tcm;
1006 	struct nlattr *tca[TCA_MAX + 1];
1007 	struct net_device *dev;
1008 	u32 clid;
1009 	struct Qdisc *q, *p;
1010 	int err;
1011 
1012 	if (net != &init_net)
1013 		return -EINVAL;
1014 
1015 replay:
1016 	/* Reinit, just in case something touches this. */
1017 	tcm = NLMSG_DATA(n);
1018 	clid = tcm->tcm_parent;
1019 	q = p = NULL;
1020 
1021 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1022 		return -ENODEV;
1023 
1024 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1025 	if (err < 0)
1026 		return err;
1027 
1028 	if (clid) {
1029 		if (clid != TC_H_ROOT) {
1030 			if (clid != TC_H_INGRESS) {
1031 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1032 					return -ENOENT;
1033 				q = qdisc_leaf(p, clid);
1034 			} else { /*ingress */
1035 				q = dev->rx_queue.qdisc_sleeping;
1036 			}
1037 		} else {
1038 			q = dev->qdisc;
1039 		}
1040 
1041 		/* It may be the default qdisc; ignore it */
1042 		if (q && q->handle == 0)
1043 			q = NULL;
1044 
1045 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1046 			if (tcm->tcm_handle) {
1047 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1048 					return -EEXIST;
1049 				if (TC_H_MIN(tcm->tcm_handle))
1050 					return -EINVAL;
1051 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1052 					goto create_n_graft;
1053 				if (n->nlmsg_flags&NLM_F_EXCL)
1054 					return -EEXIST;
1055 				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1056 					return -EINVAL;
1057 				if (q == p ||
1058 				    (p && check_loop(q, p, 0)))
1059 					return -ELOOP;
1060 				atomic_inc(&q->refcnt);
1061 				goto graft;
1062 			} else {
1063 				if (q == NULL)
1064 					goto create_n_graft;
1065 
1066 				/* This magic test requires explanation.
1067 				 *
1068 				 *   We know that some child q is already
1069 				 *   attached to this parent and we have a choice:
1070 				 *   either to change it or to create/graft a new one.
1071 				 *
1072 				 *   1. We are allowed to create/graft only
1073 				 *   if both CREATE and REPLACE flags are set.
1074 				 *
1075 				 *   2. If EXCL is set, the requestor wanted to say
1076 				 *   that the qdisc tcm_handle is not expected
1077 				 *   to exist, so we choose create/graft too.
1078 				 *
1079 				 *   3. The last case is when no flags are set.
1080 				 *   Alas, it is a sort of hole in the API: we
1081 				 *   cannot decide what to do unambiguously.
1082 				 *   For now we select create/graft if the
1083 				 *   user gave a KIND which does not match the existing one.
1084 				 */
1085 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
1086 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
1087 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
1088 				     (tca[TCA_KIND] &&
1089 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1090 					goto create_n_graft;
1091 			}
1092 		}
1093 	} else {
1094 		if (!tcm->tcm_handle)
1095 			return -EINVAL;
1096 		q = qdisc_lookup(dev, tcm->tcm_handle);
1097 	}
1098 
1099 	/* Change qdisc parameters */
1100 	if (q == NULL)
1101 		return -ENOENT;
1102 	if (n->nlmsg_flags&NLM_F_EXCL)
1103 		return -EEXIST;
1104 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1105 		return -EINVAL;
1106 	err = qdisc_change(q, tca);
1107 	if (err == 0)
1108 		qdisc_notify(skb, n, clid, NULL, q);
1109 	return err;
1110 
1111 create_n_graft:
1112 	if (!(n->nlmsg_flags&NLM_F_CREATE))
1113 		return -ENOENT;
1114 	if (clid == TC_H_INGRESS)
1115 		q = qdisc_create(dev, &dev->rx_queue, p,
1116 				 tcm->tcm_parent, tcm->tcm_parent,
1117 				 tca, &err);
1118 	else {
1119 		struct netdev_queue *dev_queue;
1120 
1121 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1122 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1123 		else if (p)
1124 			dev_queue = p->dev_queue;
1125 		else
1126 			dev_queue = netdev_get_tx_queue(dev, 0);
1127 
1128 		q = qdisc_create(dev, dev_queue, p,
1129 				 tcm->tcm_parent, tcm->tcm_handle,
1130 				 tca, &err);
1131 	}
1132 	if (q == NULL) {
1133 		if (err == -EAGAIN)
1134 			goto replay;
1135 		return err;
1136 	}
1137 
1138 graft:
1139 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1140 	if (err) {
1141 		if (q)
1142 			qdisc_destroy(q);
1143 		return err;
1144 	}
1145 
1146 	return 0;
1147 }
1148 
1149 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1150 			 u32 pid, u32 seq, u16 flags, int event)
1151 {
1152 	struct tcmsg *tcm;
1153 	struct nlmsghdr  *nlh;
1154 	unsigned char *b = skb_tail_pointer(skb);
1155 	struct gnet_dump d;
1156 
1157 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1158 	tcm = NLMSG_DATA(nlh);
1159 	tcm->tcm_family = AF_UNSPEC;
1160 	tcm->tcm__pad1 = 0;
1161 	tcm->tcm__pad2 = 0;
1162 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1163 	tcm->tcm_parent = clid;
1164 	tcm->tcm_handle = q->handle;
1165 	tcm->tcm_info = atomic_read(&q->refcnt);
1166 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1167 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
1168 		goto nla_put_failure;
1169 	q->qstats.qlen = q->q.qlen;
1170 
1171 	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1172 		goto nla_put_failure;
1173 
1174 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1175 					 qdisc_root_sleeping_lock(q), &d) < 0)
1176 		goto nla_put_failure;
1177 
1178 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1179 		goto nla_put_failure;
1180 
1181 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1182 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1183 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
1184 		goto nla_put_failure;
1185 
1186 	if (gnet_stats_finish_copy(&d) < 0)
1187 		goto nla_put_failure;
1188 
1189 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1190 	return skb->len;
1191 
1192 nlmsg_failure:
1193 nla_put_failure:
1194 	nlmsg_trim(skb, b);
1195 	return -1;
1196 }
1197 
1198 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1199 			u32 clid, struct Qdisc *old, struct Qdisc *new)
1200 {
1201 	struct sk_buff *skb;
1202 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1203 
1204 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1205 	if (!skb)
1206 		return -ENOBUFS;
1207 
1208 	if (old && old->handle) {
1209 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1210 			goto err_out;
1211 	}
1212 	if (new) {
1213 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1214 			goto err_out;
1215 	}
1216 
1217 	if (skb->len)
1218 		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1219 
1220 err_out:
1221 	kfree_skb(skb);
1222 	return -EINVAL;
1223 }
1224 
1225 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1226 {
1227 	return q->flags & TCQ_F_BUILTIN;
1228 }
1229 
1230 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1231 			      struct netlink_callback *cb,
1232 			      int *q_idx_p, int s_q_idx)
1233 {
1234 	int ret = 0, q_idx = *q_idx_p;
1235 	struct Qdisc *q;
1236 
1237 	if (!root)
1238 		return 0;
1239 
1240 	q = root;
1241 	if (q_idx < s_q_idx) {
1242 		q_idx++;
1243 	} else {
1244 		if (!tc_qdisc_dump_ignore(q) &&
1245 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1246 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1247 			goto done;
1248 		q_idx++;
1249 	}
1250 	list_for_each_entry(q, &root->list, list) {
1251 		if (q_idx < s_q_idx) {
1252 			q_idx++;
1253 			continue;
1254 		}
1255 		if (!tc_qdisc_dump_ignore(q) &&
1256 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1257 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1258 			goto done;
1259 		q_idx++;
1260 	}
1261 
1262 out:
1263 	*q_idx_p = q_idx;
1264 	return ret;
1265 done:
1266 	ret = -1;
1267 	goto out;
1268 }
1269 
1270 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1271 {
1272 	struct net *net = sock_net(skb->sk);
1273 	int idx, q_idx;
1274 	int s_idx, s_q_idx;
1275 	struct net_device *dev;
1276 
1277 	if (net != &init_net)
1278 		return 0;
1279 
1280 	s_idx = cb->args[0];
1281 	s_q_idx = q_idx = cb->args[1];
1282 	read_lock(&dev_base_lock);
1283 	idx = 0;
1284 	for_each_netdev(&init_net, dev) {
1285 		struct netdev_queue *dev_queue;
1286 
1287 		if (idx < s_idx)
1288 			goto cont;
1289 		if (idx > s_idx)
1290 			s_q_idx = 0;
1291 		q_idx = 0;
1292 
1293 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1294 			goto done;
1295 
1296 		dev_queue = &dev->rx_queue;
1297 		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1298 			goto done;
1299 
1300 cont:
1301 		idx++;
1302 	}
1303 
1304 done:
1305 	read_unlock(&dev_base_lock);
1306 
1307 	cb->args[0] = idx;
1308 	cb->args[1] = q_idx;
1309 
1310 	return skb->len;
1311 }
1312 
1313 
1314 
1315 /************************************************
1316  *	Traffic classes manipulation.		*
1317  ************************************************/
1318 
1319 
1320 
1321 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1322 {
1323 	struct net *net = sock_net(skb->sk);
1324 	struct tcmsg *tcm = NLMSG_DATA(n);
1325 	struct nlattr *tca[TCA_MAX + 1];
1326 	struct net_device *dev;
1327 	struct Qdisc *q = NULL;
1328 	const struct Qdisc_class_ops *cops;
1329 	unsigned long cl = 0;
1330 	unsigned long new_cl;
1331 	u32 pid = tcm->tcm_parent;
1332 	u32 clid = tcm->tcm_handle;
1333 	u32 qid = TC_H_MAJ(clid);
1334 	int err;
1335 
1336 	if (net != &init_net)
1337 		return -EINVAL;
1338 
1339 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1340 		return -ENODEV;
1341 
1342 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1343 	if (err < 0)
1344 		return err;
1345 
1346 	/*
1347 	   parent == TC_H_UNSPEC - unspecified parent.
1348 	   parent == TC_H_ROOT   - class is root, which has no parent.
1349 	   parent == X:0	 - parent is root class.
1350 	   parent == X:Y	 - parent is a node in hierarchy.
1351 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1352 
1353 	   handle == 0:0	 - generate handle from kernel pool.
1354 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1355 	   handle == X:Y	 - class is X:Y, fully specified.
1356 	   handle == X:0	 - root class.
1357 	 */
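
	/* Worked example of the 32-bit encoding used above: "1:2" is
	 * TC_H_MAKE(0x00010000, 2) == 0x00010002, so TC_H_MAJ() yields
	 * 0x00010000 (the qdisc, 1:0) and TC_H_MIN() yields 2 (the
	 * class within it).
	 */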
1358 
1359 	/* Step 1. Determine qdisc handle X:0 */
1360 
1361 	if (pid != TC_H_ROOT) {
1362 		u32 qid1 = TC_H_MAJ(pid);
1363 
1364 		if (qid && qid1) {
1365 			/* If both majors are known, they must be identical. */
1366 			if (qid != qid1)
1367 				return -EINVAL;
1368 		} else if (qid1) {
1369 			qid = qid1;
1370 		} else if (qid == 0)
1371 			qid = dev->qdisc->handle;
1372 
1373 		/* Now qid is a genuine qdisc handle consistent
1374 		   with both parent and child.
1375 
1376 		   TC_H_MAJ(pid) may still be unspecified; complete it now.
1377 		 */
1378 		if (pid)
1379 			pid = TC_H_MAKE(qid, pid);
1380 	} else {
1381 		if (qid == 0)
1382 			qid = dev->qdisc->handle;
1383 	}
1384 
1385 	/* OK. Locate qdisc */
1386 	if ((q = qdisc_lookup(dev, qid)) == NULL)
1387 		return -ENOENT;
1388 
1389 	/* And check that it supports classes */
1390 	cops = q->ops->cl_ops;
1391 	if (cops == NULL)
1392 		return -EINVAL;
1393 
1394 	/* Now try to get class */
1395 	if (clid == 0) {
1396 		if (pid == TC_H_ROOT)
1397 			clid = qid;
1398 	} else
1399 		clid = TC_H_MAKE(qid, clid);
1400 
1401 	if (clid)
1402 		cl = cops->get(q, clid);
1403 
1404 	if (cl == 0) {
1405 		err = -ENOENT;
1406 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1407 			goto out;
1408 	} else {
1409 		switch (n->nlmsg_type) {
1410 		case RTM_NEWTCLASS:
1411 			err = -EEXIST;
1412 			if (n->nlmsg_flags&NLM_F_EXCL)
1413 				goto out;
1414 			break;
1415 		case RTM_DELTCLASS:
1416 			err = -EOPNOTSUPP;
1417 			if (cops->delete)
1418 				err = cops->delete(q, cl);
1419 			if (err == 0)
1420 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1421 			goto out;
1422 		case RTM_GETTCLASS:
1423 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1424 			goto out;
1425 		default:
1426 			err = -EINVAL;
1427 			goto out;
1428 		}
1429 	}
1430 
1431 	new_cl = cl;
1432 	err = -EOPNOTSUPP;
1433 	if (cops->change)
1434 		err = cops->change(q, clid, pid, tca, &new_cl);
1435 	if (err == 0)
1436 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1437 
1438 out:
1439 	if (cl)
1440 		cops->put(q, cl);
1441 
1442 	return err;
1443 }
1444 
1445 
1446 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1447 			  unsigned long cl,
1448 			  u32 pid, u32 seq, u16 flags, int event)
1449 {
1450 	struct tcmsg *tcm;
1451 	struct nlmsghdr  *nlh;
1452 	unsigned char *b = skb_tail_pointer(skb);
1453 	struct gnet_dump d;
1454 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1455 
1456 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1457 	tcm = NLMSG_DATA(nlh);
1458 	tcm->tcm_family = AF_UNSPEC;
1459 	tcm->tcm__pad1 = 0;
1460 	tcm->tcm__pad2 = 0;
1461 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1462 	tcm->tcm_parent = q->handle;
1463 	tcm->tcm_handle = q->handle;
1464 	tcm->tcm_info = 0;
1465 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1466 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1467 		goto nla_put_failure;
1468 
1469 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1470 					 qdisc_root_sleeping_lock(q), &d) < 0)
1471 		goto nla_put_failure;
1472 
1473 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1474 		goto nla_put_failure;
1475 
1476 	if (gnet_stats_finish_copy(&d) < 0)
1477 		goto nla_put_failure;
1478 
1479 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1480 	return skb->len;
1481 
1482 nlmsg_failure:
1483 nla_put_failure:
1484 	nlmsg_trim(skb, b);
1485 	return -1;
1486 }
1487 
1488 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1489 			  struct Qdisc *q, unsigned long cl, int event)
1490 {
1491 	struct sk_buff *skb;
1492 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1493 
1494 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1495 	if (!skb)
1496 		return -ENOBUFS;
1497 
1498 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1499 		kfree_skb(skb);
1500 		return -EINVAL;
1501 	}
1502 
1503 	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1504 }
1505 
1506 struct qdisc_dump_args
1507 {
1508 	struct qdisc_walker w;
1509 	struct sk_buff *skb;
1510 	struct netlink_callback *cb;
1511 };
1512 
1513 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1514 {
1515 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1516 
1517 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1518 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1519 }
1520 
1521 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1522 				struct tcmsg *tcm, struct netlink_callback *cb,
1523 				int *t_p, int s_t)
1524 {
1525 	struct qdisc_dump_args arg;
1526 
1527 	if (tc_qdisc_dump_ignore(q) ||
1528 	    *t_p < s_t || !q->ops->cl_ops ||
1529 	    (tcm->tcm_parent &&
1530 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1531 		(*t_p)++;
1532 		return 0;
1533 	}
1534 	if (*t_p > s_t)
1535 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1536 	arg.w.fn = qdisc_class_dump;
1537 	arg.skb = skb;
1538 	arg.cb = cb;
1539 	arg.w.stop  = 0;
1540 	arg.w.skip = cb->args[1];
1541 	arg.w.count = 0;
1542 	q->ops->cl_ops->walk(q, &arg.w);
1543 	cb->args[1] = arg.w.count;
1544 	if (arg.w.stop)
1545 		return -1;
1546 	(*t_p)++;
1547 	return 0;
1548 }
1549 
1550 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1551 			       struct tcmsg *tcm, struct netlink_callback *cb,
1552 			       int *t_p, int s_t)
1553 {
1554 	struct Qdisc *q;
1555 
1556 	if (!root)
1557 		return 0;
1558 
1559 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1560 		return -1;
1561 
1562 	list_for_each_entry(q, &root->list, list) {
1563 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1564 			return -1;
1565 	}
1566 
1567 	return 0;
1568 }
1569 
1570 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1571 {
1572 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1573 	struct net *net = sock_net(skb->sk);
1574 	struct netdev_queue *dev_queue;
1575 	struct net_device *dev;
1576 	int t, s_t;
1577 
1578 	if (net != &init_net)
1579 		return 0;
1580 
1581 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1582 		return 0;
1583 	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1584 		return 0;
1585 
1586 	s_t = cb->args[0];
1587 	t = 0;
1588 
1589 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1590 		goto done;
1591 
1592 	dev_queue = &dev->rx_queue;
1593 	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1594 		goto done;
1595 
1596 done:
1597 	cb->args[0] = t;
1598 
1599 	dev_put(dev);
1600 	return skb->len;
1601 }
1602 
1603 /* Main classifier routine: scans the classifier chain attached
1604    to this qdisc, (optionally) tests for the protocol and asks the
1605    specific classifiers.
1606  */
1607 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1608 		       struct tcf_result *res)
1609 {
1610 	__be16 protocol = skb->protocol;
1611 	int err = 0;
1612 
1613 	for (; tp; tp = tp->next) {
1614 		if ((tp->protocol == protocol ||
1615 		     tp->protocol == htons(ETH_P_ALL)) &&
1616 		    (err = tp->classify(skb, tp, res)) >= 0) {
1617 #ifdef CONFIG_NET_CLS_ACT
1618 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1619 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1620 #endif
1621 			return err;
1622 		}
1623 	}
1624 	return -1;
1625 }
1626 EXPORT_SYMBOL(tc_classify_compat);
1627 
1628 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1629 		struct tcf_result *res)
1630 {
1631 	int err = 0;
1632 	__be16 protocol;
1633 #ifdef CONFIG_NET_CLS_ACT
1634 	struct tcf_proto *otp = tp;
1635 reclassify:
1636 #endif
1637 	protocol = skb->protocol;
1638 
1639 	err = tc_classify_compat(skb, tp, res);
1640 #ifdef CONFIG_NET_CLS_ACT
1641 	if (err == TC_ACT_RECLASSIFY) {
1642 		u32 verd = G_TC_VERD(skb->tc_verd);
1643 		tp = otp;
1644 
1645 		if (verd++ >= MAX_REC_LOOP) {
1646 			printk(KERN_NOTICE "rule prio %u protocol %02x reclassify loop, "
1647 			       "packet dropped\n",
1648 			       tp->prio&0xffff, ntohs(tp->protocol));
1649 			return TC_ACT_SHOT;
1650 		}
1651 		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1652 		goto reclassify;
1653 	}
1654 #endif
1655 	return err;
1656 }
1657 EXPORT_SYMBOL(tc_classify);
1658 
1659 void tcf_destroy(struct tcf_proto *tp)
1660 {
1661 	tp->ops->destroy(tp);
1662 	module_put(tp->ops->owner);
1663 	kfree(tp);
1664 }
1665 
1666 void tcf_destroy_chain(struct tcf_proto **fl)
1667 {
1668 	struct tcf_proto *tp;
1669 
1670 	while ((tp = *fl) != NULL) {
1671 		*fl = tp->next;
1672 		tcf_destroy(tp);
1673 	}
1674 }
1675 EXPORT_SYMBOL(tcf_destroy_chain);
1676 
1677 #ifdef CONFIG_PROC_FS
1678 static int psched_show(struct seq_file *seq, void *v)
1679 {
1680 	struct timespec ts;
1681 
1682 	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1683 	seq_printf(seq, "%08x %08x %08x %08x\n",
1684 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1685 		   1000000,
1686 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1687 
1688 	return 0;
1689 }
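
/* The four hex words are, in order: nanoseconds per user-visible
 * microsecond, nanoseconds per psched tick, a legacy 1000000 constant,
 * and the clock resolution in Hz; user space (tc) reads these to
 * convert between its time units and kernel ticks.
 */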
1690 
1691 static int psched_open(struct inode *inode, struct file *file)
1692 {
1693 	return single_open(file, psched_show, PDE(inode)->data);
1694 }
1695 
1696 static const struct file_operations psched_fops = {
1697 	.owner = THIS_MODULE,
1698 	.open = psched_open,
1699 	.read  = seq_read,
1700 	.llseek = seq_lseek,
1701 	.release = single_release,
1702 };
1703 #endif
1704 
1705 static int __init pktsched_init(void)
1706 {
1707 	register_qdisc(&pfifo_qdisc_ops);
1708 	register_qdisc(&bfifo_qdisc_ops);
1709 	register_qdisc(&mq_qdisc_ops);
1710 	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1711 
1712 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1713 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1714 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1715 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1716 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1717 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1718 
1719 	return 0;
1720 }
1721 
1722 subsys_initcall(pktsched_init);
1723