xref: /openbmc/linux/net/sched/sch_api.c (revision 6ec1c69a8f6492fd25722f4762721921da074c12)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 
32 #include <net/net_namespace.h>
33 #include <net/sock.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
36 
37 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 			struct Qdisc *old, struct Qdisc *new);
39 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 			 struct Qdisc *q, unsigned long cl, int event);
41 
42 /*
43 
44    Short review.
45    -------------
46 
47    This file consists of two interrelated parts:
48 
49    1. queueing disciplines manager frontend.
50    2. traffic classes manager frontend.
51 
52    Generally, queueing discipline ("qdisc") is a black box,
53    which is able to enqueue packets and to dequeue them (when
54    device is ready to send something) in order and at times
55    determined by algorithm hidden in it.
56 
57    qdiscs are divided into two categories:
58    - "queues", which have no internal structure visible from outside.
59    - "schedulers", which split all the packets to "traffic classes",
60      using "packet classifiers" (look at cls_api.c)
61 
62    In turn, classes may have child qdiscs (as rule, queues)
63    attached to them etc. etc. etc.
64 
65    The goal of the routines in this file is to translate
66    information supplied by user in the form of handles
67    to more intelligible for kernel form, to make some sanity
68    checks and part of work, which is common to all qdiscs
69    and to provide rtnetlink notifications.
70 
71    All real intelligent work is done inside qdisc modules.
72 
73 
74 
75    Every discipline has two major routines: enqueue and dequeue.
76 
77    ---dequeue
78 
79    dequeue usually returns a skb to send. It is allowed to return NULL,
80    but it does not mean that queue is empty, it just means that
81    discipline does not want to send anything this time.
82    Queue is really empty if q->q.qlen == 0.
83    For complicated disciplines with multiple queues q->q is not
84    real packet queue, but however q->q.qlen must be valid.
85 
86    ---enqueue
87 
88    enqueue returns 0 if the packet was enqueued successfully.
89    If a packet (this one or another one) was dropped, it returns
90    a non-zero error code.
91    NET_XMIT_DROP 	- this packet dropped
92      Expected action: do not backoff, but wait until queue will clear.
93    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
94      Expected action: backoff or ignore
95    NET_XMIT_POLICED	- dropped by police.
96      Expected action: backoff or error to real-time apps.
97 
98    Auxiliary routines:
99 
100    ---peek
101 
102    like dequeue but without removing a packet from the queue
103 
104    ---reset
105 
106    returns qdisc to initial state: purge all buffers, clear all
107    timers, counters (except for statistics) etc.
108 
109    ---init
110 
111    initializes newly created qdisc.
112 
113    ---destroy
114 
115    destroys resources allocated by init and during lifetime of qdisc.
116 
117    ---change
118 
119    changes qdisc parameters.
120  */
121 
122 /* Protects list of registered TC modules. It is pure SMP lock. */
123 static DEFINE_RWLOCK(qdisc_mod_lock);
124 
125 
126 /************************************************
127  *	Queueing disciplines manipulation.	*
128  ************************************************/
129 
130 
131 /* The list of all installed queueing disciplines. */
132 
133 static struct Qdisc_ops *qdisc_base;
134 
135 /* Register/uregister queueing discipline */
136 
137 int register_qdisc(struct Qdisc_ops *qops)
138 {
139 	struct Qdisc_ops *q, **qp;
140 	int rc = -EEXIST;
141 
142 	write_lock(&qdisc_mod_lock);
143 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 		if (!strcmp(qops->id, q->id))
145 			goto out;
146 
147 	if (qops->enqueue == NULL)
148 		qops->enqueue = noop_qdisc_ops.enqueue;
149 	if (qops->peek == NULL) {
150 		if (qops->dequeue == NULL) {
151 			qops->peek = noop_qdisc_ops.peek;
152 		} else {
153 			rc = -EINVAL;
154 			goto out;
155 		}
156 	}
157 	if (qops->dequeue == NULL)
158 		qops->dequeue = noop_qdisc_ops.dequeue;
159 
160 	qops->next = NULL;
161 	*qp = qops;
162 	rc = 0;
163 out:
164 	write_unlock(&qdisc_mod_lock);
165 	return rc;
166 }
167 EXPORT_SYMBOL(register_qdisc);
168 
169 int unregister_qdisc(struct Qdisc_ops *qops)
170 {
171 	struct Qdisc_ops *q, **qp;
172 	int err = -ENOENT;
173 
174 	write_lock(&qdisc_mod_lock);
175 	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
176 		if (q == qops)
177 			break;
178 	if (q) {
179 		*qp = q->next;
180 		q->next = NULL;
181 		err = 0;
182 	}
183 	write_unlock(&qdisc_mod_lock);
184 	return err;
185 }
186 EXPORT_SYMBOL(unregister_qdisc);
187 
188 /* We know handle. Find qdisc among all qdisc's attached to device
189    (root qdisc, all its children, children of children etc.)
190  */
191 
192 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193 {
194 	struct Qdisc *q;
195 
196 	if (!(root->flags & TCQ_F_BUILTIN) &&
197 	    root->handle == handle)
198 		return root;
199 
200 	list_for_each_entry(q, &root->list, list) {
201 		if (q->handle == handle)
202 			return q;
203 	}
204 	return NULL;
205 }
206 
207 static void qdisc_list_add(struct Qdisc *q)
208 {
209 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
210 		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
211 }
212 
213 void qdisc_list_del(struct Qdisc *q)
214 {
215 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
216 		list_del(&q->list);
217 }
218 EXPORT_SYMBOL(qdisc_list_del);
219 
220 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
221 {
222 	struct Qdisc *q;
223 
224 	q = qdisc_match_from_root(dev->qdisc, handle);
225 	if (q)
226 		goto out;
227 
228 	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
229 out:
230 	return q;
231 }
232 
233 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
234 {
235 	unsigned long cl;
236 	struct Qdisc *leaf;
237 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
238 
239 	if (cops == NULL)
240 		return NULL;
241 	cl = cops->get(p, classid);
242 
243 	if (cl == 0)
244 		return NULL;
245 	leaf = cops->leaf(p, cl);
246 	cops->put(p, cl);
247 	return leaf;
248 }
249 
250 /* Find queueing discipline by name */
251 
252 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
253 {
254 	struct Qdisc_ops *q = NULL;
255 
256 	if (kind) {
257 		read_lock(&qdisc_mod_lock);
258 		for (q = qdisc_base; q; q = q->next) {
259 			if (nla_strcmp(kind, q->id) == 0) {
260 				if (!try_module_get(q->owner))
261 					q = NULL;
262 				break;
263 			}
264 		}
265 		read_unlock(&qdisc_mod_lock);
266 	}
267 	return q;
268 }
269 
270 static struct qdisc_rate_table *qdisc_rtab_list;
271 
272 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
273 {
274 	struct qdisc_rate_table *rtab;
275 
276 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
277 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
278 			rtab->refcnt++;
279 			return rtab;
280 		}
281 	}
282 
283 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
284 	    nla_len(tab) != TC_RTAB_SIZE)
285 		return NULL;
286 
287 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
288 	if (rtab) {
289 		rtab->rate = *r;
290 		rtab->refcnt = 1;
291 		memcpy(rtab->data, nla_data(tab), 1024);
292 		rtab->next = qdisc_rtab_list;
293 		qdisc_rtab_list = rtab;
294 	}
295 	return rtab;
296 }
297 EXPORT_SYMBOL(qdisc_get_rtab);
298 
299 void qdisc_put_rtab(struct qdisc_rate_table *tab)
300 {
301 	struct qdisc_rate_table *rtab, **rtabp;
302 
303 	if (!tab || --tab->refcnt)
304 		return;
305 
306 	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
307 		if (rtab == tab) {
308 			*rtabp = rtab->next;
309 			kfree(rtab);
310 			return;
311 		}
312 	}
313 }
314 EXPORT_SYMBOL(qdisc_put_rtab);
315 
316 static LIST_HEAD(qdisc_stab_list);
317 static DEFINE_SPINLOCK(qdisc_stab_lock);
318 
319 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
320 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
321 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
322 };
323 
324 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
325 {
326 	struct nlattr *tb[TCA_STAB_MAX + 1];
327 	struct qdisc_size_table *stab;
328 	struct tc_sizespec *s;
329 	unsigned int tsize = 0;
330 	u16 *tab = NULL;
331 	int err;
332 
333 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
334 	if (err < 0)
335 		return ERR_PTR(err);
336 	if (!tb[TCA_STAB_BASE])
337 		return ERR_PTR(-EINVAL);
338 
339 	s = nla_data(tb[TCA_STAB_BASE]);
340 
341 	if (s->tsize > 0) {
342 		if (!tb[TCA_STAB_DATA])
343 			return ERR_PTR(-EINVAL);
344 		tab = nla_data(tb[TCA_STAB_DATA]);
345 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
346 	}
347 
348 	if (!s || tsize != s->tsize || (!tab && tsize > 0))
349 		return ERR_PTR(-EINVAL);
350 
351 	spin_lock(&qdisc_stab_lock);
352 
353 	list_for_each_entry(stab, &qdisc_stab_list, list) {
354 		if (memcmp(&stab->szopts, s, sizeof(*s)))
355 			continue;
356 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
357 			continue;
358 		stab->refcnt++;
359 		spin_unlock(&qdisc_stab_lock);
360 		return stab;
361 	}
362 
363 	spin_unlock(&qdisc_stab_lock);
364 
365 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
366 	if (!stab)
367 		return ERR_PTR(-ENOMEM);
368 
369 	stab->refcnt = 1;
370 	stab->szopts = *s;
371 	if (tsize > 0)
372 		memcpy(stab->data, tab, tsize * sizeof(u16));
373 
374 	spin_lock(&qdisc_stab_lock);
375 	list_add_tail(&stab->list, &qdisc_stab_list);
376 	spin_unlock(&qdisc_stab_lock);
377 
378 	return stab;
379 }
380 
381 void qdisc_put_stab(struct qdisc_size_table *tab)
382 {
383 	if (!tab)
384 		return;
385 
386 	spin_lock(&qdisc_stab_lock);
387 
388 	if (--tab->refcnt == 0) {
389 		list_del(&tab->list);
390 		kfree(tab);
391 	}
392 
393 	spin_unlock(&qdisc_stab_lock);
394 }
395 EXPORT_SYMBOL(qdisc_put_stab);
396 
397 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
398 {
399 	struct nlattr *nest;
400 
401 	nest = nla_nest_start(skb, TCA_STAB);
402 	if (nest == NULL)
403 		goto nla_put_failure;
404 	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
405 	nla_nest_end(skb, nest);
406 
407 	return skb->len;
408 
409 nla_put_failure:
410 	return -1;
411 }
412 
413 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
414 {
415 	int pkt_len, slot;
416 
417 	pkt_len = skb->len + stab->szopts.overhead;
418 	if (unlikely(!stab->szopts.tsize))
419 		goto out;
420 
421 	slot = pkt_len + stab->szopts.cell_align;
422 	if (unlikely(slot < 0))
423 		slot = 0;
424 
425 	slot >>= stab->szopts.cell_log;
426 	if (likely(slot < stab->szopts.tsize))
427 		pkt_len = stab->data[slot];
428 	else
429 		pkt_len = stab->data[stab->szopts.tsize - 1] *
430 				(slot / stab->szopts.tsize) +
431 				stab->data[slot % stab->szopts.tsize];
432 
433 	pkt_len <<= stab->szopts.size_log;
434 out:
435 	if (unlikely(pkt_len < 1))
436 		pkt_len = 1;
437 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
438 }
439 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
440 
441 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
442 {
443 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
444 		printk(KERN_WARNING
445 		       "%s: %s qdisc %X: is non-work-conserving?\n",
446 		       txt, qdisc->ops->id, qdisc->handle >> 16);
447 		qdisc->flags |= TCQ_F_WARN_NONWC;
448 	}
449 }
450 EXPORT_SYMBOL(qdisc_warn_nonwc);
451 
452 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
453 {
454 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
455 						 timer);
456 
457 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
458 	__netif_schedule(qdisc_root(wd->qdisc));
459 
460 	return HRTIMER_NORESTART;
461 }
462 
463 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
464 {
465 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
466 	wd->timer.function = qdisc_watchdog;
467 	wd->qdisc = qdisc;
468 }
469 EXPORT_SYMBOL(qdisc_watchdog_init);
470 
471 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
472 {
473 	ktime_t time;
474 
475 	if (test_bit(__QDISC_STATE_DEACTIVATED,
476 		     &qdisc_root_sleeping(wd->qdisc)->state))
477 		return;
478 
479 	wd->qdisc->flags |= TCQ_F_THROTTLED;
480 	time = ktime_set(0, 0);
481 	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
482 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
483 }
484 EXPORT_SYMBOL(qdisc_watchdog_schedule);
485 
486 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
487 {
488 	hrtimer_cancel(&wd->timer);
489 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
490 }
491 EXPORT_SYMBOL(qdisc_watchdog_cancel);
492 
493 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
494 {
495 	unsigned int size = n * sizeof(struct hlist_head), i;
496 	struct hlist_head *h;
497 
498 	if (size <= PAGE_SIZE)
499 		h = kmalloc(size, GFP_KERNEL);
500 	else
501 		h = (struct hlist_head *)
502 			__get_free_pages(GFP_KERNEL, get_order(size));
503 
504 	if (h != NULL) {
505 		for (i = 0; i < n; i++)
506 			INIT_HLIST_HEAD(&h[i]);
507 	}
508 	return h;
509 }
510 
511 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
512 {
513 	unsigned int size = n * sizeof(struct hlist_head);
514 
515 	if (size <= PAGE_SIZE)
516 		kfree(h);
517 	else
518 		free_pages((unsigned long)h, get_order(size));
519 }
520 
521 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
522 {
523 	struct Qdisc_class_common *cl;
524 	struct hlist_node *n, *next;
525 	struct hlist_head *nhash, *ohash;
526 	unsigned int nsize, nmask, osize;
527 	unsigned int i, h;
528 
529 	/* Rehash when load factor exceeds 0.75 */
530 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
531 		return;
532 	nsize = clhash->hashsize * 2;
533 	nmask = nsize - 1;
534 	nhash = qdisc_class_hash_alloc(nsize);
535 	if (nhash == NULL)
536 		return;
537 
538 	ohash = clhash->hash;
539 	osize = clhash->hashsize;
540 
541 	sch_tree_lock(sch);
542 	for (i = 0; i < osize; i++) {
543 		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
544 			h = qdisc_class_hash(cl->classid, nmask);
545 			hlist_add_head(&cl->hnode, &nhash[h]);
546 		}
547 	}
548 	clhash->hash     = nhash;
549 	clhash->hashsize = nsize;
550 	clhash->hashmask = nmask;
551 	sch_tree_unlock(sch);
552 
553 	qdisc_class_hash_free(ohash, osize);
554 }
555 EXPORT_SYMBOL(qdisc_class_hash_grow);
556 
557 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
558 {
559 	unsigned int size = 4;
560 
561 	clhash->hash = qdisc_class_hash_alloc(size);
562 	if (clhash->hash == NULL)
563 		return -ENOMEM;
564 	clhash->hashsize  = size;
565 	clhash->hashmask  = size - 1;
566 	clhash->hashelems = 0;
567 	return 0;
568 }
569 EXPORT_SYMBOL(qdisc_class_hash_init);
570 
571 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
572 {
573 	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
574 }
575 EXPORT_SYMBOL(qdisc_class_hash_destroy);
576 
577 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
578 			     struct Qdisc_class_common *cl)
579 {
580 	unsigned int h;
581 
582 	INIT_HLIST_NODE(&cl->hnode);
583 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
584 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
585 	clhash->hashelems++;
586 }
587 EXPORT_SYMBOL(qdisc_class_hash_insert);
588 
589 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
590 			     struct Qdisc_class_common *cl)
591 {
592 	hlist_del(&cl->hnode);
593 	clhash->hashelems--;
594 }
595 EXPORT_SYMBOL(qdisc_class_hash_remove);
596 
597 /* Allocate an unique handle from space managed by kernel */
598 
599 static u32 qdisc_alloc_handle(struct net_device *dev)
600 {
601 	int i = 0x10000;
602 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
603 
604 	do {
605 		autohandle += TC_H_MAKE(0x10000U, 0);
606 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
607 			autohandle = TC_H_MAKE(0x80000000U, 0);
608 	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
609 
610 	return i>0 ? autohandle : 0;
611 }
612 
613 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
614 {
615 	const struct Qdisc_class_ops *cops;
616 	unsigned long cl;
617 	u32 parentid;
618 
619 	if (n == 0)
620 		return;
621 	while ((parentid = sch->parent)) {
622 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
623 			return;
624 
625 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
626 		if (sch == NULL) {
627 			WARN_ON(parentid != TC_H_ROOT);
628 			return;
629 		}
630 		cops = sch->ops->cl_ops;
631 		if (cops->qlen_notify) {
632 			cl = cops->get(sch, parentid);
633 			cops->qlen_notify(sch, cl);
634 			cops->put(sch, cl);
635 		}
636 		sch->q.qlen -= n;
637 	}
638 }
639 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
640 
641 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
642 			       struct Qdisc *old, struct Qdisc *new)
643 {
644 	if (new || old)
645 		qdisc_notify(skb, n, clid, old, new);
646 
647 	if (old)
648 		qdisc_destroy(old);
649 }
650 
651 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
652  * to device "dev".
653  *
654  * When appropriate send a netlink notification using 'skb'
655  * and "n".
656  *
657  * On success, destroy old qdisc.
658  */
659 
660 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
661 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
662 		       struct Qdisc *new, struct Qdisc *old)
663 {
664 	struct Qdisc *q = old;
665 	int err = 0;
666 
667 	if (parent == NULL) {
668 		unsigned int i, num_q, ingress;
669 
670 		ingress = 0;
671 		num_q = dev->num_tx_queues;
672 		if ((q && q->flags & TCQ_F_INGRESS) ||
673 		    (new && new->flags & TCQ_F_INGRESS)) {
674 			num_q = 1;
675 			ingress = 1;
676 		}
677 
678 		if (dev->flags & IFF_UP)
679 			dev_deactivate(dev);
680 
681 		if (new && new->ops->attach) {
682 			new->ops->attach(new);
683 			num_q = 0;
684 		}
685 
686 		for (i = 0; i < num_q; i++) {
687 			struct netdev_queue *dev_queue = &dev->rx_queue;
688 
689 			if (!ingress)
690 				dev_queue = netdev_get_tx_queue(dev, i);
691 
692 			old = dev_graft_qdisc(dev_queue, new);
693 			if (new && i > 0)
694 				atomic_inc(&new->refcnt);
695 
696 			qdisc_destroy(old);
697 		}
698 
699 		notify_and_destroy(skb, n, classid, dev->qdisc, new);
700 		if (new && !new->ops->attach)
701 			atomic_inc(&new->refcnt);
702 		dev->qdisc = new ? : &noop_qdisc;
703 
704 		if (dev->flags & IFF_UP)
705 			dev_activate(dev);
706 	} else {
707 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
708 
709 		err = -EOPNOTSUPP;
710 		if (cops && cops->graft) {
711 			unsigned long cl = cops->get(parent, classid);
712 			if (cl) {
713 				err = cops->graft(parent, cl, new, &old);
714 				cops->put(parent, cl);
715 			} else
716 				err = -ENOENT;
717 		}
718 		if (!err)
719 			notify_and_destroy(skb, n, classid, old, new);
720 	}
721 	return err;
722 }
723 
724 /* lockdep annotation is needed for ingress; egress gets it only for name */
725 static struct lock_class_key qdisc_tx_lock;
726 static struct lock_class_key qdisc_rx_lock;
727 
728 /*
729    Allocate and initialize new qdisc.
730 
731    Parameters are passed via opt.
732  */
733 
734 static struct Qdisc *
735 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
736 	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
737 {
738 	int err;
739 	struct nlattr *kind = tca[TCA_KIND];
740 	struct Qdisc *sch;
741 	struct Qdisc_ops *ops;
742 	struct qdisc_size_table *stab;
743 
744 	ops = qdisc_lookup_ops(kind);
745 #ifdef CONFIG_MODULES
746 	if (ops == NULL && kind != NULL) {
747 		char name[IFNAMSIZ];
748 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
749 			/* We dropped the RTNL semaphore in order to
750 			 * perform the module load.  So, even if we
751 			 * succeeded in loading the module we have to
752 			 * tell the caller to replay the request.  We
753 			 * indicate this using -EAGAIN.
754 			 * We replay the request because the device may
755 			 * go away in the mean time.
756 			 */
757 			rtnl_unlock();
758 			request_module("sch_%s", name);
759 			rtnl_lock();
760 			ops = qdisc_lookup_ops(kind);
761 			if (ops != NULL) {
762 				/* We will try again qdisc_lookup_ops,
763 				 * so don't keep a reference.
764 				 */
765 				module_put(ops->owner);
766 				err = -EAGAIN;
767 				goto err_out;
768 			}
769 		}
770 	}
771 #endif
772 
773 	err = -ENOENT;
774 	if (ops == NULL)
775 		goto err_out;
776 
777 	sch = qdisc_alloc(dev_queue, ops);
778 	if (IS_ERR(sch)) {
779 		err = PTR_ERR(sch);
780 		goto err_out2;
781 	}
782 
783 	sch->parent = parent;
784 
785 	if (handle == TC_H_INGRESS) {
786 		sch->flags |= TCQ_F_INGRESS;
787 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
788 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
789 	} else {
790 		if (handle == 0) {
791 			handle = qdisc_alloc_handle(dev);
792 			err = -ENOMEM;
793 			if (handle == 0)
794 				goto err_out3;
795 		}
796 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
797 	}
798 
799 	sch->handle = handle;
800 
801 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
802 		if (tca[TCA_STAB]) {
803 			stab = qdisc_get_stab(tca[TCA_STAB]);
804 			if (IS_ERR(stab)) {
805 				err = PTR_ERR(stab);
806 				goto err_out3;
807 			}
808 			sch->stab = stab;
809 		}
810 		if (tca[TCA_RATE]) {
811 			spinlock_t *root_lock;
812 
813 			if ((sch->parent != TC_H_ROOT) &&
814 			    !(sch->flags & TCQ_F_INGRESS))
815 				root_lock = qdisc_root_sleeping_lock(sch);
816 			else
817 				root_lock = qdisc_lock(sch);
818 
819 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
820 						root_lock, tca[TCA_RATE]);
821 			if (err) {
822 				/*
823 				 * Any broken qdiscs that would require
824 				 * a ops->reset() here? The qdisc was never
825 				 * in action so it shouldn't be necessary.
826 				 */
827 				if (ops->destroy)
828 					ops->destroy(sch);
829 				goto err_out3;
830 			}
831 		}
832 
833 		qdisc_list_add(sch);
834 
835 		return sch;
836 	}
837 err_out3:
838 	qdisc_put_stab(sch->stab);
839 	dev_put(dev);
840 	kfree((char *) sch - sch->padded);
841 err_out2:
842 	module_put(ops->owner);
843 err_out:
844 	*errp = err;
845 	return NULL;
846 }
847 
848 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
849 {
850 	struct qdisc_size_table *stab = NULL;
851 	int err = 0;
852 
853 	if (tca[TCA_OPTIONS]) {
854 		if (sch->ops->change == NULL)
855 			return -EINVAL;
856 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
857 		if (err)
858 			return err;
859 	}
860 
861 	if (tca[TCA_STAB]) {
862 		stab = qdisc_get_stab(tca[TCA_STAB]);
863 		if (IS_ERR(stab))
864 			return PTR_ERR(stab);
865 	}
866 
867 	qdisc_put_stab(sch->stab);
868 	sch->stab = stab;
869 
870 	if (tca[TCA_RATE])
871 		/* NB: ignores errors from replace_estimator
872 		   because change can't be undone. */
873 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
874 					    qdisc_root_sleeping_lock(sch),
875 					    tca[TCA_RATE]);
876 
877 	return 0;
878 }
879 
880 struct check_loop_arg
881 {
882 	struct qdisc_walker 	w;
883 	struct Qdisc		*p;
884 	int			depth;
885 };
886 
887 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
888 
889 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
890 {
891 	struct check_loop_arg	arg;
892 
893 	if (q->ops->cl_ops == NULL)
894 		return 0;
895 
896 	arg.w.stop = arg.w.skip = arg.w.count = 0;
897 	arg.w.fn = check_loop_fn;
898 	arg.depth = depth;
899 	arg.p = p;
900 	q->ops->cl_ops->walk(q, &arg.w);
901 	return arg.w.stop ? -ELOOP : 0;
902 }
903 
904 static int
905 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
906 {
907 	struct Qdisc *leaf;
908 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
909 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
910 
911 	leaf = cops->leaf(q, cl);
912 	if (leaf) {
913 		if (leaf == arg->p || arg->depth > 7)
914 			return -ELOOP;
915 		return check_loop(leaf, arg->p, arg->depth + 1);
916 	}
917 	return 0;
918 }
919 
920 /*
921  * Delete/get qdisc.
922  */
923 
924 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
925 {
926 	struct net *net = sock_net(skb->sk);
927 	struct tcmsg *tcm = NLMSG_DATA(n);
928 	struct nlattr *tca[TCA_MAX + 1];
929 	struct net_device *dev;
930 	u32 clid = tcm->tcm_parent;
931 	struct Qdisc *q = NULL;
932 	struct Qdisc *p = NULL;
933 	int err;
934 
935 	if (net != &init_net)
936 		return -EINVAL;
937 
938 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
939 		return -ENODEV;
940 
941 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
942 	if (err < 0)
943 		return err;
944 
945 	if (clid) {
946 		if (clid != TC_H_ROOT) {
947 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
948 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
949 					return -ENOENT;
950 				q = qdisc_leaf(p, clid);
951 			} else { /* ingress */
952 				q = dev->rx_queue.qdisc_sleeping;
953 			}
954 		} else {
955 			q = dev->qdisc;
956 		}
957 		if (!q)
958 			return -ENOENT;
959 
960 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
961 			return -EINVAL;
962 	} else {
963 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
964 			return -ENOENT;
965 	}
966 
967 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
968 		return -EINVAL;
969 
970 	if (n->nlmsg_type == RTM_DELQDISC) {
971 		if (!clid)
972 			return -EINVAL;
973 		if (q->handle == 0)
974 			return -ENOENT;
975 		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
976 			return err;
977 	} else {
978 		qdisc_notify(skb, n, clid, NULL, q);
979 	}
980 	return 0;
981 }
982 
983 /*
984    Create/change qdisc.
985  */
986 
987 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
988 {
989 	struct net *net = sock_net(skb->sk);
990 	struct tcmsg *tcm;
991 	struct nlattr *tca[TCA_MAX + 1];
992 	struct net_device *dev;
993 	u32 clid;
994 	struct Qdisc *q, *p;
995 	int err;
996 
997 	if (net != &init_net)
998 		return -EINVAL;
999 
1000 replay:
1001 	/* Reinit, just in case something touches this. */
1002 	tcm = NLMSG_DATA(n);
1003 	clid = tcm->tcm_parent;
1004 	q = p = NULL;
1005 
1006 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1007 		return -ENODEV;
1008 
1009 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1010 	if (err < 0)
1011 		return err;
1012 
1013 	if (clid) {
1014 		if (clid != TC_H_ROOT) {
1015 			if (clid != TC_H_INGRESS) {
1016 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1017 					return -ENOENT;
1018 				q = qdisc_leaf(p, clid);
1019 			} else { /*ingress */
1020 				q = dev->rx_queue.qdisc_sleeping;
1021 			}
1022 		} else {
1023 			q = dev->qdisc;
1024 		}
1025 
1026 		/* It may be default qdisc, ignore it */
1027 		if (q && q->handle == 0)
1028 			q = NULL;
1029 
1030 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1031 			if (tcm->tcm_handle) {
1032 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1033 					return -EEXIST;
1034 				if (TC_H_MIN(tcm->tcm_handle))
1035 					return -EINVAL;
1036 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1037 					goto create_n_graft;
1038 				if (n->nlmsg_flags&NLM_F_EXCL)
1039 					return -EEXIST;
1040 				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1041 					return -EINVAL;
1042 				if (q == p ||
1043 				    (p && check_loop(q, p, 0)))
1044 					return -ELOOP;
1045 				atomic_inc(&q->refcnt);
1046 				goto graft;
1047 			} else {
1048 				if (q == NULL)
1049 					goto create_n_graft;
1050 
1051 				/* This magic test requires explanation.
1052 				 *
1053 				 *   We know, that some child q is already
1054 				 *   attached to this parent and have choice:
1055 				 *   either to change it or to create/graft new one.
1056 				 *
1057 				 *   1. We are allowed to create/graft only
1058 				 *   if CREATE and REPLACE flags are set.
1059 				 *
1060 				 *   2. If EXCL is set, requestor wanted to say,
1061 				 *   that qdisc tcm_handle is not expected
1062 				 *   to exist, so that we choose create/graft too.
1063 				 *
1064 				 *   3. The last case is when no flags are set.
1065 				 *   Alas, it is sort of hole in API, we
1066 				 *   cannot decide what to do unambiguously.
1067 				 *   For now we select create/graft, if
1068 				 *   user gave KIND, which does not match existing.
1069 				 */
1070 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
1071 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
1072 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
1073 				     (tca[TCA_KIND] &&
1074 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1075 					goto create_n_graft;
1076 			}
1077 		}
1078 	} else {
1079 		if (!tcm->tcm_handle)
1080 			return -EINVAL;
1081 		q = qdisc_lookup(dev, tcm->tcm_handle);
1082 	}
1083 
1084 	/* Change qdisc parameters */
1085 	if (q == NULL)
1086 		return -ENOENT;
1087 	if (n->nlmsg_flags&NLM_F_EXCL)
1088 		return -EEXIST;
1089 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1090 		return -EINVAL;
1091 	err = qdisc_change(q, tca);
1092 	if (err == 0)
1093 		qdisc_notify(skb, n, clid, NULL, q);
1094 	return err;
1095 
1096 create_n_graft:
1097 	if (!(n->nlmsg_flags&NLM_F_CREATE))
1098 		return -ENOENT;
1099 	if (clid == TC_H_INGRESS)
1100 		q = qdisc_create(dev, &dev->rx_queue,
1101 				 tcm->tcm_parent, tcm->tcm_parent,
1102 				 tca, &err);
1103 	else {
1104 		unsigned int ntx = 0;
1105 
1106 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1107 			ntx = p->ops->cl_ops->select_queue(p, tcm);
1108 
1109 		q = qdisc_create(dev, netdev_get_tx_queue(dev, ntx),
1110 				 tcm->tcm_parent, tcm->tcm_handle,
1111 				 tca, &err);
1112 	}
1113 	if (q == NULL) {
1114 		if (err == -EAGAIN)
1115 			goto replay;
1116 		return err;
1117 	}
1118 
1119 graft:
1120 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1121 	if (err) {
1122 		if (q)
1123 			qdisc_destroy(q);
1124 		return err;
1125 	}
1126 
1127 	return 0;
1128 }
1129 
1130 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1131 			 u32 pid, u32 seq, u16 flags, int event)
1132 {
1133 	struct tcmsg *tcm;
1134 	struct nlmsghdr  *nlh;
1135 	unsigned char *b = skb_tail_pointer(skb);
1136 	struct gnet_dump d;
1137 
1138 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1139 	tcm = NLMSG_DATA(nlh);
1140 	tcm->tcm_family = AF_UNSPEC;
1141 	tcm->tcm__pad1 = 0;
1142 	tcm->tcm__pad2 = 0;
1143 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1144 	tcm->tcm_parent = clid;
1145 	tcm->tcm_handle = q->handle;
1146 	tcm->tcm_info = atomic_read(&q->refcnt);
1147 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1148 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
1149 		goto nla_put_failure;
1150 	q->qstats.qlen = q->q.qlen;
1151 
1152 	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1153 		goto nla_put_failure;
1154 
1155 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1156 					 qdisc_root_sleeping_lock(q), &d) < 0)
1157 		goto nla_put_failure;
1158 
1159 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1160 		goto nla_put_failure;
1161 
1162 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1163 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1164 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
1165 		goto nla_put_failure;
1166 
1167 	if (gnet_stats_finish_copy(&d) < 0)
1168 		goto nla_put_failure;
1169 
1170 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1171 	return skb->len;
1172 
1173 nlmsg_failure:
1174 nla_put_failure:
1175 	nlmsg_trim(skb, b);
1176 	return -1;
1177 }
1178 
1179 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1180 			u32 clid, struct Qdisc *old, struct Qdisc *new)
1181 {
1182 	struct sk_buff *skb;
1183 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1184 
1185 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1186 	if (!skb)
1187 		return -ENOBUFS;
1188 
1189 	if (old && old->handle) {
1190 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1191 			goto err_out;
1192 	}
1193 	if (new) {
1194 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1195 			goto err_out;
1196 	}
1197 
1198 	if (skb->len)
1199 		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1200 
1201 err_out:
1202 	kfree_skb(skb);
1203 	return -EINVAL;
1204 }
1205 
1206 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1207 {
1208 	return (q->flags & TCQ_F_BUILTIN) ? true : false;
1209 }
1210 
1211 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1212 			      struct netlink_callback *cb,
1213 			      int *q_idx_p, int s_q_idx)
1214 {
1215 	int ret = 0, q_idx = *q_idx_p;
1216 	struct Qdisc *q;
1217 
1218 	if (!root)
1219 		return 0;
1220 
1221 	q = root;
1222 	if (q_idx < s_q_idx) {
1223 		q_idx++;
1224 	} else {
1225 		if (!tc_qdisc_dump_ignore(q) &&
1226 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1227 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1228 			goto done;
1229 		q_idx++;
1230 	}
1231 	list_for_each_entry(q, &root->list, list) {
1232 		if (q_idx < s_q_idx) {
1233 			q_idx++;
1234 			continue;
1235 		}
1236 		if (!tc_qdisc_dump_ignore(q) &&
1237 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1238 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1239 			goto done;
1240 		q_idx++;
1241 	}
1242 
1243 out:
1244 	*q_idx_p = q_idx;
1245 	return ret;
1246 done:
1247 	ret = -1;
1248 	goto out;
1249 }
1250 
1251 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1252 {
1253 	struct net *net = sock_net(skb->sk);
1254 	int idx, q_idx;
1255 	int s_idx, s_q_idx;
1256 	struct net_device *dev;
1257 
1258 	if (net != &init_net)
1259 		return 0;
1260 
1261 	s_idx = cb->args[0];
1262 	s_q_idx = q_idx = cb->args[1];
1263 	read_lock(&dev_base_lock);
1264 	idx = 0;
1265 	for_each_netdev(&init_net, dev) {
1266 		struct netdev_queue *dev_queue;
1267 
1268 		if (idx < s_idx)
1269 			goto cont;
1270 		if (idx > s_idx)
1271 			s_q_idx = 0;
1272 		q_idx = 0;
1273 
1274 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1275 			goto done;
1276 
1277 		dev_queue = &dev->rx_queue;
1278 		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1279 			goto done;
1280 
1281 cont:
1282 		idx++;
1283 	}
1284 
1285 done:
1286 	read_unlock(&dev_base_lock);
1287 
1288 	cb->args[0] = idx;
1289 	cb->args[1] = q_idx;
1290 
1291 	return skb->len;
1292 }
1293 
1294 
1295 
1296 /************************************************
1297  *	Traffic classes manipulation.		*
1298  ************************************************/
1299 
1300 
1301 
/*
 * rtnetlink handler for traffic class requests; the switch below handles
 * RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS.
 *
 * The request's struct tcmsg supplies the device (tcm_ifindex), the parent
 * (tcm_parent) and the class handle (tcm_handle).  Note that the local
 * variable "pid" holds the *parent* class id here, not a netlink port id.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL      request from a namespace other than init_net, mismatching
 *                majors in parent and handle, qdisc without class
 *                operations, or unknown message type
 *   -ENODEV      no device with the given ifindex
 *   -ENOENT      qdisc not found, or class not found and the request is
 *                not RTM_NEWTCLASS with NLM_F_CREATE
 *   -EEXIST      class exists and NLM_F_EXCL was given
 *   -EOPNOTSUPP  the qdisc has no ->delete() / ->change() operation
 * or whatever nlmsg_parse(), ->delete(), ->change() or tclass_notify()
 * (for RTM_GETTCLASS) return.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			/* Neither major given: fall back to the device's qdisc. */
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	/* cl stays 0 when no class id is known or ->get() finds nothing. */
	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		/* A missing class is only acceptable when creating one. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			/* Existing class without NLM_F_EXCL: change it below. */
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			/* The notification result is deliberately not propagated. */
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create a new class or modify the existing one via ->change(). */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	/* Hand back the class obtained from ->get() above, if any. */
	if (cl)
		cops->put(q, cl);

	return err;
}
1425 
1426 
1427 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1428 			  unsigned long cl,
1429 			  u32 pid, u32 seq, u16 flags, int event)
1430 {
1431 	struct tcmsg *tcm;
1432 	struct nlmsghdr  *nlh;
1433 	unsigned char *b = skb_tail_pointer(skb);
1434 	struct gnet_dump d;
1435 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1436 
1437 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1438 	tcm = NLMSG_DATA(nlh);
1439 	tcm->tcm_family = AF_UNSPEC;
1440 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1441 	tcm->tcm_parent = q->handle;
1442 	tcm->tcm_handle = q->handle;
1443 	tcm->tcm_info = 0;
1444 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1445 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1446 		goto nla_put_failure;
1447 
1448 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1449 					 qdisc_root_sleeping_lock(q), &d) < 0)
1450 		goto nla_put_failure;
1451 
1452 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1453 		goto nla_put_failure;
1454 
1455 	if (gnet_stats_finish_copy(&d) < 0)
1456 		goto nla_put_failure;
1457 
1458 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1459 	return skb->len;
1460 
1461 nlmsg_failure:
1462 nla_put_failure:
1463 	nlmsg_trim(skb, b);
1464 	return -1;
1465 }
1466 
1467 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1468 			  struct Qdisc *q, unsigned long cl, int event)
1469 {
1470 	struct sk_buff *skb;
1471 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1472 
1473 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1474 	if (!skb)
1475 		return -ENOBUFS;
1476 
1477 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1478 		kfree_skb(skb);
1479 		return -EINVAL;
1480 	}
1481 
1482 	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1483 }
1484 
/*
 * Walker state used while dumping the classes of one qdisc.
 *
 * The walker must remain the first member: qdisc_class_dump() receives a
 * pointer to @w and casts it straight back to struct qdisc_dump_args.
 */
struct qdisc_dump_args
{
	struct qdisc_walker w;		/* walker handed to cl_ops->walk() */
	struct sk_buff *skb;		/* message buffer being filled */
	struct netlink_callback *cb;	/* dump request being answered */
};
1491 
1492 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1493 {
1494 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1495 
1496 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1497 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1498 }
1499 
1500 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1501 				struct tcmsg *tcm, struct netlink_callback *cb,
1502 				int *t_p, int s_t)
1503 {
1504 	struct qdisc_dump_args arg;
1505 
1506 	if (tc_qdisc_dump_ignore(q) ||
1507 	    *t_p < s_t || !q->ops->cl_ops ||
1508 	    (tcm->tcm_parent &&
1509 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1510 		(*t_p)++;
1511 		return 0;
1512 	}
1513 	if (*t_p > s_t)
1514 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1515 	arg.w.fn = qdisc_class_dump;
1516 	arg.skb = skb;
1517 	arg.cb = cb;
1518 	arg.w.stop  = 0;
1519 	arg.w.skip = cb->args[1];
1520 	arg.w.count = 0;
1521 	q->ops->cl_ops->walk(q, &arg.w);
1522 	cb->args[1] = arg.w.count;
1523 	if (arg.w.stop)
1524 		return -1;
1525 	(*t_p)++;
1526 	return 0;
1527 }
1528 
1529 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1530 			       struct tcmsg *tcm, struct netlink_callback *cb,
1531 			       int *t_p, int s_t)
1532 {
1533 	struct Qdisc *q;
1534 
1535 	if (!root)
1536 		return 0;
1537 
1538 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1539 		return -1;
1540 
1541 	list_for_each_entry(q, &root->list, list) {
1542 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1543 			return -1;
1544 	}
1545 
1546 	return 0;
1547 }
1548 
1549 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1550 {
1551 	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1552 	struct net *net = sock_net(skb->sk);
1553 	struct netdev_queue *dev_queue;
1554 	struct net_device *dev;
1555 	int t, s_t;
1556 
1557 	if (net != &init_net)
1558 		return 0;
1559 
1560 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1561 		return 0;
1562 	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1563 		return 0;
1564 
1565 	s_t = cb->args[0];
1566 	t = 0;
1567 
1568 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1569 		goto done;
1570 
1571 	dev_queue = &dev->rx_queue;
1572 	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1573 		goto done;
1574 
1575 done:
1576 	cb->args[0] = t;
1577 
1578 	dev_put(dev);
1579 	return skb->len;
1580 }
1581 
1582 /* Main classifier routine: scans classifier chain attached
1583    to this qdisc, (optionally) tests for protocol and asks
1584    specific classifiers.
1585  */
1586 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1587 		       struct tcf_result *res)
1588 {
1589 	__be16 protocol = skb->protocol;
1590 	int err = 0;
1591 
1592 	for (; tp; tp = tp->next) {
1593 		if ((tp->protocol == protocol ||
1594 		     tp->protocol == htons(ETH_P_ALL)) &&
1595 		    (err = tp->classify(skb, tp, res)) >= 0) {
1596 #ifdef CONFIG_NET_CLS_ACT
1597 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1598 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1599 #endif
1600 			return err;
1601 		}
1602 	}
1603 	return -1;
1604 }
1605 EXPORT_SYMBOL(tc_classify_compat);
1606 
/*
 * Classify @skb against the chain starting at @tp.
 *
 * This wraps tc_classify_compat().  With CONFIG_NET_CLS_ACT, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain; the number of restarts is tracked in skb->tc_verd and once it
 * reaches MAX_REC_LOOP the packet is given the TC_ACT_SHOT verdict.
 *
 * Returns the result of tc_classify_compat(), or TC_ACT_SHOT when the
 * reclassification limit was hit.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			/*
			 * This runs per packet; a looping rule set would
			 * otherwise flood the log, so throttle the message
			 * and give it an explicit log level.
			 */
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "rule prio %u protocol %02x reclassify loop, "
				       "packet dropped\n",
				       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
1637 
1638 void tcf_destroy(struct tcf_proto *tp)
1639 {
1640 	tp->ops->destroy(tp);
1641 	module_put(tp->ops->owner);
1642 	kfree(tp);
1643 }
1644 
1645 void tcf_destroy_chain(struct tcf_proto **fl)
1646 {
1647 	struct tcf_proto *tp;
1648 
1649 	while ((tp = *fl) != NULL) {
1650 		*fl = tp->next;
1651 		tcf_destroy(tp);
1652 	}
1653 }
1654 EXPORT_SYMBOL(tcf_destroy_chain);
1655 
1656 #ifdef CONFIG_PROC_FS
1657 static int psched_show(struct seq_file *seq, void *v)
1658 {
1659 	struct timespec ts;
1660 
1661 	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1662 	seq_printf(seq, "%08x %08x %08x %08x\n",
1663 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1664 		   1000000,
1665 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1666 
1667 	return 0;
1668 }
1669 
1670 static int psched_open(struct inode *inode, struct file *file)
1671 {
1672 	return single_open(file, psched_show, PDE(inode)->data);
1673 }
1674 
1675 static const struct file_operations psched_fops = {
1676 	.owner = THIS_MODULE,
1677 	.open = psched_open,
1678 	.read  = seq_read,
1679 	.llseek = seq_lseek,
1680 	.release = single_release,
1681 };
1682 #endif
1683 
/*
 * Packet scheduler initialisation, run as a subsys initcall.
 *
 * Registers the pfifo, bfifo and mq qdisc operations, creates the "psched"
 * proc entry in init_net, and hooks the qdisc and traffic class rtnetlink
 * message types up to their handlers.  None of the return values of the
 * registration calls are checked; the function always returns 0.
 */
static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	/* Qdisc messages: only RTM_GETQDISC has a dump callback. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	/* Class messages all share tc_ctl_tclass() as their handler. */
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);
1702