xref: /openbmc/linux/net/sched/sch_api.c (revision b04b4f78)
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 
32 #include <net/net_namespace.h>
33 #include <net/sock.h>
34 #include <net/netlink.h>
35 #include <net/pkt_sched.h>
36 
37 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
38 			struct Qdisc *old, struct Qdisc *new);
39 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
40 			 struct Qdisc *q, unsigned long cl, int event);
41 
42 /*
43 
44    Short review.
45    -------------
46 
47    This file consists of two interrelated parts:
48 
49    1. queueing disciplines manager frontend.
50    2. traffic classes manager frontend.
51 
52    Generally, a queueing discipline ("qdisc") is a black box
53    that can enqueue packets and dequeue them (when the device
54    is ready to send something), in an order and at times
55    determined by the algorithm hidden inside it.
56 
57    qdiscs are divided into two categories:
58    - "queues", which have no internal structure visible from outside.
59    - "schedulers", which split all the packets into "traffic classes",
60      using "packet classifiers" (see cls_api.c).
61 
62    In turn, classes may have child qdiscs (as a rule, queues)
63    attached to them, and so on recursively.
64 
65    The goal of the routines in this file is to translate
66    the information supplied by the user in the form of handles
67    into a form intelligible to the kernel, to perform sanity
68    checks and the work common to all qdiscs, and to provide
69    rtnetlink notifications.
70 
71    All real intelligent work is done inside qdisc modules.
72 
73 
74 
75    Every discipline has two major routines: enqueue and dequeue.
76 
77    ---dequeue
78 
79    dequeue usually returns an skb to send. It is allowed to return NULL,
80    but that does not mean the queue is empty; it just means the
81    discipline does not want to send anything at this time.
82    The queue is really empty only if q->q.qlen == 0.
83    For complicated disciplines with multiple queues, q->q is not the
84    real packet queue, but q->q.qlen must still be valid.
85 
86    ---enqueue
87 
88    enqueue returns 0 if the packet was enqueued successfully.
89    If a packet (this one or another one) was dropped, it returns
90    a non-zero error code:
91    NET_XMIT_DROP 	- this packet was dropped.
92      Expected action: do not back off, but wait until the queue clears.
93    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
94      Expected action: back off or ignore.
95    NET_XMIT_POLICED	- dropped by a policer.
96      Expected action: back off or report an error to real-time apps.
97 
98    Auxiliary routines:
99 
100    ---peek
101 
102    like dequeue but without removing a packet from the queue
103 
104    ---reset
105 
106    returns the qdisc to its initial state: purges all buffers, clears
107    all timers and counters (except for statistics), etc.
108 
109    ---init
110 
111    initializes a newly created qdisc.
112 
113    ---destroy
114 
115    destroys resources allocated by init and during the lifetime of the qdisc.
116 
117    ---change
118 
119    changes qdisc parameters.
120  */
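/*
 * A hypothetical sketch (not part of the original file): a minimal
 * FIFO-style qdisc illustrating the enqueue/dequeue contract described
 * above.  All "example_*" names are invented; real schedulers live in
 * sch_*.c and must also handle limits, statistics and locking.
 */
#if 0	/* illustration only, never compiled */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	__skb_queue_tail(&sch->q, skb);	/* helper keeps q->q.qlen valid */
	return NET_XMIT_SUCCESS;	/* 0: packet accepted */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* May legitimately return NULL while q->q.qlen > 0,
	 * e.g. when the discipline is throttled. */
	return __skb_dequeue(&sch->q);
}
#endif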
121 
122 /* Protects the list of registered TC modules. It is a pure SMP lock. */
123 static DEFINE_RWLOCK(qdisc_mod_lock);
124 
125 
126 /************************************************
127  *	Queueing disciplines manipulation.	*
128  ************************************************/
129 
130 
131 /* The list of all installed queueing disciplines. */
132 
133 static struct Qdisc_ops *qdisc_base;
134 
135 /* Register/unregister queueing discipline */
136 
137 int register_qdisc(struct Qdisc_ops *qops)
138 {
139 	struct Qdisc_ops *q, **qp;
140 	int rc = -EEXIST;
141 
142 	write_lock(&qdisc_mod_lock);
143 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 		if (!strcmp(qops->id, q->id))
145 			goto out;
146 
147 	if (qops->enqueue == NULL)
148 		qops->enqueue = noop_qdisc_ops.enqueue;
149 	if (qops->peek == NULL) {
150 		if (qops->dequeue == NULL) {
151 			qops->peek = noop_qdisc_ops.peek;
152 		} else {
153 			rc = -EINVAL;
154 			goto out;
155 		}
156 	}
157 	if (qops->dequeue == NULL)
158 		qops->dequeue = noop_qdisc_ops.dequeue;
159 
160 	qops->next = NULL;
161 	*qp = qops;
162 	rc = 0;
163 out:
164 	write_unlock(&qdisc_mod_lock);
165 	return rc;
166 }
167 EXPORT_SYMBOL(register_qdisc);
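/*
 * A hypothetical sketch of how a scheduler module registers its ops.
 * Note that register_qdisc() above rejects ops which supply ->dequeue
 * without ->peek, so a real module provides one (qdisc_peek_dequeued()
 * is a common default).  The "example_*" names refer to the sketch
 * near the top of this file and are invented.
 */
#if 0	/* illustration only */
static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_dequeued,
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);
#endif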
168 
169 int unregister_qdisc(struct Qdisc_ops *qops)
170 {
171 	struct Qdisc_ops *q, **qp;
172 	int err = -ENOENT;
173 
174 	write_lock(&qdisc_mod_lock);
175 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
176 		if (q == qops)
177 			break;
178 	if (q) {
179 		*qp = q->next;
180 		q->next = NULL;
181 		err = 0;
182 	}
183 	write_unlock(&qdisc_mod_lock);
184 	return err;
185 }
186 EXPORT_SYMBOL(unregister_qdisc);
187 
188 /* We know the handle. Find the qdisc among all qdiscs attached to the
189    device (the root qdisc, all its children, children of children, etc.)
190  */
191 
192 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
193 {
194 	struct Qdisc *q;
195 
196 	if (!(root->flags & TCQ_F_BUILTIN) &&
197 	    root->handle == handle)
198 		return root;
199 
200 	list_for_each_entry(q, &root->list, list) {
201 		if (q->handle == handle)
202 			return q;
203 	}
204 	return NULL;
205 }
206 
207 static void qdisc_list_add(struct Qdisc *q)
208 {
209 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
210 		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
211 }
212 
213 void qdisc_list_del(struct Qdisc *q)
214 {
215 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
216 		list_del(&q->list);
217 }
218 EXPORT_SYMBOL(qdisc_list_del);
219 
220 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
221 {
222 	unsigned int i;
223 	struct Qdisc *q;
224 
225 	for (i = 0; i < dev->num_tx_queues; i++) {
226 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
227 		struct Qdisc *txq_root = txq->qdisc_sleeping;
228 
229 		q = qdisc_match_from_root(txq_root, handle);
230 		if (q)
231 			goto out;
232 	}
233 
234 	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
235 out:
236 	return q;
237 }
238 
239 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
240 {
241 	unsigned long cl;
242 	struct Qdisc *leaf;
243 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
244 
245 	if (cops == NULL)
246 		return NULL;
247 	cl = cops->get(p, classid);
248 
249 	if (cl == 0)
250 		return NULL;
251 	leaf = cops->leaf(p, cl);
252 	cops->put(p, cl);
253 	return leaf;
254 }
255 
256 /* Find queueing discipline by name */
257 
258 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
259 {
260 	struct Qdisc_ops *q = NULL;
261 
262 	if (kind) {
263 		read_lock(&qdisc_mod_lock);
264 		for (q = qdisc_base; q; q = q->next) {
265 			if (nla_strcmp(kind, q->id) == 0) {
266 				if (!try_module_get(q->owner))
267 					q = NULL;
268 				break;
269 			}
270 		}
271 		read_unlock(&qdisc_mod_lock);
272 	}
273 	return q;
274 }
275 
276 static struct qdisc_rate_table *qdisc_rtab_list;
277 
278 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
279 {
280 	struct qdisc_rate_table *rtab;
281 
282 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
283 		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
284 			rtab->refcnt++;
285 			return rtab;
286 		}
287 	}
288 
289 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
290 	    nla_len(tab) != TC_RTAB_SIZE)
291 		return NULL;
292 
293 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
294 	if (rtab) {
295 		rtab->rate = *r;
296 		rtab->refcnt = 1;
297 		memcpy(rtab->data, nla_data(tab), 1024);
298 		rtab->next = qdisc_rtab_list;
299 		qdisc_rtab_list = rtab;
300 	}
301 	return rtab;
302 }
303 EXPORT_SYMBOL(qdisc_get_rtab);
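/*
 * A hypothetical sketch of a caller: a shaping qdisc's ->change()
 * obtaining a shared rate table.  TCA_TBF_* are real TBF attributes;
 * "example_change" is invented and error handling is abbreviated.
 */
#if 0	/* illustration only */
static int example_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab;
	int err;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, NULL);
	if (err < 0)
		return err;
	if (tb[TCA_TBF_PARMS] == NULL)
		return -EINVAL;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
	if (rtab == NULL)
		return -EINVAL;
	/* ... use rtab; call qdisc_put_rtab(rtab) when done with it ... */
	return 0;
}
#endif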
304 
305 void qdisc_put_rtab(struct qdisc_rate_table *tab)
306 {
307 	struct qdisc_rate_table *rtab, **rtabp;
308 
309 	if (!tab || --tab->refcnt)
310 		return;
311 
312 	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
313 		if (rtab == tab) {
314 			*rtabp = rtab->next;
315 			kfree(rtab);
316 			return;
317 		}
318 	}
319 }
320 EXPORT_SYMBOL(qdisc_put_rtab);
321 
322 static LIST_HEAD(qdisc_stab_list);
323 static DEFINE_SPINLOCK(qdisc_stab_lock);
324 
325 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
326 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
327 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
328 };
329 
330 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
331 {
332 	struct nlattr *tb[TCA_STAB_MAX + 1];
333 	struct qdisc_size_table *stab;
334 	struct tc_sizespec *s;
335 	unsigned int tsize = 0;
336 	u16 *tab = NULL;
337 	int err;
338 
339 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
340 	if (err < 0)
341 		return ERR_PTR(err);
342 	if (!tb[TCA_STAB_BASE])
343 		return ERR_PTR(-EINVAL);
344 
345 	s = nla_data(tb[TCA_STAB_BASE]);
346 
347 	if (s->tsize > 0) {
348 		if (!tb[TCA_STAB_DATA])
349 			return ERR_PTR(-EINVAL);
350 		tab = nla_data(tb[TCA_STAB_DATA]);
351 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
352 	}
353 
354 	if (!s || tsize != s->tsize || (!tab && tsize > 0))
355 		return ERR_PTR(-EINVAL);
356 
357 	spin_lock(&qdisc_stab_lock);
358 
359 	list_for_each_entry(stab, &qdisc_stab_list, list) {
360 		if (memcmp(&stab->szopts, s, sizeof(*s)))
361 			continue;
362 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
363 			continue;
364 		stab->refcnt++;
365 		spin_unlock(&qdisc_stab_lock);
366 		return stab;
367 	}
368 
369 	spin_unlock(&qdisc_stab_lock);
370 
371 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
372 	if (!stab)
373 		return ERR_PTR(-ENOMEM);
374 
375 	stab->refcnt = 1;
376 	stab->szopts = *s;
377 	if (tsize > 0)
378 		memcpy(stab->data, tab, tsize * sizeof(u16));
379 
380 	spin_lock(&qdisc_stab_lock);
381 	list_add_tail(&stab->list, &qdisc_stab_list);
382 	spin_unlock(&qdisc_stab_lock);
383 
384 	return stab;
385 }
386 
387 void qdisc_put_stab(struct qdisc_size_table *tab)
388 {
389 	if (!tab)
390 		return;
391 
392 	spin_lock(&qdisc_stab_lock);
393 
394 	if (--tab->refcnt == 0) {
395 		list_del(&tab->list);
396 		kfree(tab);
397 	}
398 
399 	spin_unlock(&qdisc_stab_lock);
400 }
401 EXPORT_SYMBOL(qdisc_put_stab);
402 
403 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
404 {
405 	struct nlattr *nest;
406 
407 	nest = nla_nest_start(skb, TCA_STAB);
408 	if (nest == NULL)
409 		goto nla_put_failure;
410 	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
411 	nla_nest_end(skb, nest);
412 
413 	return skb->len;
414 
415 nla_put_failure:
416 	return -1;
417 }
418 
419 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
420 {
421 	int pkt_len, slot;
422 
423 	pkt_len = skb->len + stab->szopts.overhead;
424 	if (unlikely(!stab->szopts.tsize))
425 		goto out;
426 
427 	slot = pkt_len + stab->szopts.cell_align;
428 	if (unlikely(slot < 0))
429 		slot = 0;
430 
431 	slot >>= stab->szopts.cell_log;
432 	if (likely(slot < stab->szopts.tsize))
433 		pkt_len = stab->data[slot];
434 	else
435 		pkt_len = stab->data[stab->szopts.tsize - 1] *
436 				(slot / stab->szopts.tsize) +
437 				stab->data[slot % stab->szopts.tsize];
438 
439 	pkt_len <<= stab->szopts.size_log;
440 out:
441 	if (unlikely(pkt_len < 1))
442 		pkt_len = 1;
443 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
444 }
445 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
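/*
 * A worked example with hypothetical numbers: with overhead = 24,
 * cell_align = -1, cell_log = 6 and size_log = 0, a 1500-byte skb
 * gives pkt_len = 1524 and slot = (1524 - 1) >> 6 = 23, so the qdisc
 * charges stab->data[23] bytes (e.g. the true link-layer cost on ATM)
 * instead of skb->len.
 */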
446 
447 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
448 {
449 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
450 		printk(KERN_WARNING
451 		       "%s: %s qdisc %X: is non-work-conserving?\n",
452 		       txt, qdisc->ops->id, qdisc->handle >> 16);
453 		qdisc->flags |= TCQ_F_WARN_NONWC;
454 	}
455 }
456 EXPORT_SYMBOL(qdisc_warn_nonwc);
457 
458 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
459 {
460 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
461 						 timer);
462 
463 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
464 	__netif_schedule(qdisc_root(wd->qdisc));
465 
466 	return HRTIMER_NORESTART;
467 }
468 
469 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
470 {
471 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
472 	wd->timer.function = qdisc_watchdog;
473 	wd->qdisc = qdisc;
474 }
475 EXPORT_SYMBOL(qdisc_watchdog_init);
476 
477 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
478 {
479 	ktime_t time;
480 
481 	if (test_bit(__QDISC_STATE_DEACTIVATED,
482 		     &qdisc_root_sleeping(wd->qdisc)->state))
483 		return;
484 
485 	wd->qdisc->flags |= TCQ_F_THROTTLED;
486 	time = ktime_set(0, 0);
487 	time = ktime_add_ns(time, PSCHED_US2NS(expires));
488 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
489 }
490 EXPORT_SYMBOL(qdisc_watchdog_schedule);
491 
492 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
493 {
494 	hrtimer_cancel(&wd->timer);
495 	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
496 }
497 EXPORT_SYMBOL(qdisc_watchdog_cancel);
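/*
 * A hypothetical sketch of a user: a rate-limiting qdisc arms the
 * watchdog from ->dequeue() when it may not send yet; the hrtimer
 * above clears TCQ_F_THROTTLED and reschedules the qdisc when it
 * fires.  "example_sched_data" (holding a struct qdisc_watchdog) and
 * "next_send_time" are invented names.
 */
#if 0	/* illustration only */
static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	psched_time_t now = psched_get_time();

	if (now < q->next_send_time) {
		qdisc_watchdog_schedule(&q->watchdog, q->next_send_time);
		return NULL;	/* NULL here does not mean "queue empty" */
	}
	return qdisc_dequeue_head(sch);	/* placeholder for real logic */
}
#endif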
498 
499 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
500 {
501 	unsigned int size = n * sizeof(struct hlist_head), i;
502 	struct hlist_head *h;
503 
504 	if (size <= PAGE_SIZE)
505 		h = kmalloc(size, GFP_KERNEL);
506 	else
507 		h = (struct hlist_head *)
508 			__get_free_pages(GFP_KERNEL, get_order(size));
509 
510 	if (h != NULL) {
511 		for (i = 0; i < n; i++)
512 			INIT_HLIST_HEAD(&h[i]);
513 	}
514 	return h;
515 }
516 
517 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
518 {
519 	unsigned int size = n * sizeof(struct hlist_head);
520 
521 	if (size <= PAGE_SIZE)
522 		kfree(h);
523 	else
524 		free_pages((unsigned long)h, get_order(size));
525 }
526 
527 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
528 {
529 	struct Qdisc_class_common *cl;
530 	struct hlist_node *n, *next;
531 	struct hlist_head *nhash, *ohash;
532 	unsigned int nsize, nmask, osize;
533 	unsigned int i, h;
534 
535 	/* Rehash when load factor exceeds 0.75 */
536 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
537 		return;
538 	nsize = clhash->hashsize * 2;
539 	nmask = nsize - 1;
540 	nhash = qdisc_class_hash_alloc(nsize);
541 	if (nhash == NULL)
542 		return;
543 
544 	ohash = clhash->hash;
545 	osize = clhash->hashsize;
546 
547 	sch_tree_lock(sch);
548 	for (i = 0; i < osize; i++) {
549 		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
550 			h = qdisc_class_hash(cl->classid, nmask);
551 			hlist_add_head(&cl->hnode, &nhash[h]);
552 		}
553 	}
554 	clhash->hash     = nhash;
555 	clhash->hashsize = nsize;
556 	clhash->hashmask = nmask;
557 	sch_tree_unlock(sch);
558 
559 	qdisc_class_hash_free(ohash, osize);
560 }
561 EXPORT_SYMBOL(qdisc_class_hash_grow);
562 
563 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
564 {
565 	unsigned int size = 4;
566 
567 	clhash->hash = qdisc_class_hash_alloc(size);
568 	if (clhash->hash == NULL)
569 		return -ENOMEM;
570 	clhash->hashsize  = size;
571 	clhash->hashmask  = size - 1;
572 	clhash->hashelems = 0;
573 	return 0;
574 }
575 EXPORT_SYMBOL(qdisc_class_hash_init);
576 
577 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
578 {
579 	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
580 }
581 EXPORT_SYMBOL(qdisc_class_hash_destroy);
582 
583 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
584 			     struct Qdisc_class_common *cl)
585 {
586 	unsigned int h;
587 
588 	INIT_HLIST_NODE(&cl->hnode);
589 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
590 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
591 	clhash->hashelems++;
592 }
593 EXPORT_SYMBOL(qdisc_class_hash_insert);
594 
595 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
596 			     struct Qdisc_class_common *cl)
597 {
598 	hlist_del(&cl->hnode);
599 	clhash->hashelems--;
600 }
601 EXPORT_SYMBOL(qdisc_class_hash_remove);
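/*
 * The expected life cycle of the class hash in a classful qdisc, as a
 * hypothetical fragment ("q", "cl" and friends are invented; see e.g.
 * sch_htb.c for a real user):
 */
#if 0	/* illustration only */
	/* in ->init():          */ err = qdisc_class_hash_init(&q->clhash);
	/* when adding a class:  */ qdisc_class_hash_insert(&q->clhash, &cl->common);
	/*                       */ qdisc_class_hash_grow(sch, &q->clhash);
	/* when deleting a class:*/ qdisc_class_hash_remove(&q->clhash, &cl->common);
	/* in ->destroy():       */ qdisc_class_hash_destroy(&q->clhash);
#endif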
602 
603 /* Allocate a unique handle from the space managed by the kernel */
604 
605 static u32 qdisc_alloc_handle(struct net_device *dev)
606 {
607 	int i = 0x10000;
608 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
609 
610 	do {
611 		autohandle += TC_H_MAKE(0x10000U, 0);
612 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
613 			autohandle = TC_H_MAKE(0x80000000U, 0);
614 	} while (qdisc_lookup(dev, autohandle) && --i > 0);
615 
616 	return i > 0 ? autohandle : 0;
617 }
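/*
 * A handle is a 32-bit value with the major number (the qdisc) in the
 * upper 16 bits and the minor number (a class) in the lower 16.  With
 * hypothetical values: TC_H_MAKE(0x80010000, 0) names qdisc 8001:,
 * TC_H_MAKE(0x80010000, 2) names class 8001:2, and TC_H_MAJ()/
 * TC_H_MIN() extract the two halves.  The allocator above hands out
 * majors from 0x8001 upward.
 */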
618 
619 /* Attach toplevel qdisc to device queue. */
620 
621 static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
622 				     struct Qdisc *qdisc)
623 {
624 	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
625 	spinlock_t *root_lock;
626 
627 	root_lock = qdisc_lock(oqdisc);
628 	spin_lock_bh(root_lock);
629 
630 	/* Prune old scheduler */
631 	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
632 		qdisc_reset(oqdisc);
633 
634 	/* ... and graft new one */
635 	if (qdisc == NULL)
636 		qdisc = &noop_qdisc;
637 	dev_queue->qdisc_sleeping = qdisc;
638 	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
639 
640 	spin_unlock_bh(root_lock);
641 
642 	return oqdisc;
643 }
644 
645 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
646 {
647 	const struct Qdisc_class_ops *cops;
648 	unsigned long cl;
649 	u32 parentid;
650 
651 	if (n == 0)
652 		return;
653 	while ((parentid = sch->parent)) {
654 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
655 			return;
656 
657 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
658 		if (sch == NULL) {
659 			WARN_ON(parentid != TC_H_ROOT);
660 			return;
661 		}
662 		cops = sch->ops->cl_ops;
663 		if (cops->qlen_notify) {
664 			cl = cops->get(sch, parentid);
665 			cops->qlen_notify(sch, cl);
666 			cops->put(sch, cl);
667 		}
668 		sch->q.qlen -= n;
669 	}
670 }
671 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
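/*
 * A hypothetical caller fragment: a qdisc that drops packets from
 * inside its own tree must propagate the count upward, or its
 * ancestors' q.qlen counters would go stale ("example_purge" is an
 * invented helper):
 */
#if 0	/* illustration only */
	unsigned int dropped = example_purge(sch);
	qdisc_tree_decrease_qlen(sch, dropped);
#endif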
672 
673 static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
674 			       struct Qdisc *old, struct Qdisc *new)
675 {
676 	if (new || old)
677 		qdisc_notify(skb, n, clid, old, new);
678 
679 	if (old)
680 		qdisc_destroy(old);
681 }
682 
683 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
684  * to device "dev".
685  *
686  * When appropriate, send a netlink notification using "skb"
687  * and "n".
688  *
689  * On success, destroy the old qdisc.
690  */
691 
692 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
693 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
694 		       struct Qdisc *new, struct Qdisc *old)
695 {
696 	struct Qdisc *q = old;
697 	int err = 0;
698 
699 	if (parent == NULL) {
700 		unsigned int i, num_q, ingress;
701 
702 		ingress = 0;
703 		num_q = dev->num_tx_queues;
704 		if ((q && q->flags & TCQ_F_INGRESS) ||
705 		    (new && new->flags & TCQ_F_INGRESS)) {
706 			num_q = 1;
707 			ingress = 1;
708 		}
709 
710 		if (dev->flags & IFF_UP)
711 			dev_deactivate(dev);
712 
713 		for (i = 0; i < num_q; i++) {
714 			struct netdev_queue *dev_queue = &dev->rx_queue;
715 
716 			if (!ingress)
717 				dev_queue = netdev_get_tx_queue(dev, i);
718 
719 			old = dev_graft_qdisc(dev_queue, new);
720 			if (new && i > 0)
721 				atomic_inc(&new->refcnt);
722 
723 			notify_and_destroy(skb, n, classid, old, new);
724 		}
725 
726 		if (dev->flags & IFF_UP)
727 			dev_activate(dev);
728 	} else {
729 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
730 
731 		err = -EINVAL;
732 
733 		if (cops) {
734 			unsigned long cl = cops->get(parent, classid);
735 			if (cl) {
736 				err = cops->graft(parent, cl, new, &old);
737 				cops->put(parent, cl);
738 			}
739 		}
740 		if (!err)
741 			notify_and_destroy(skb, n, classid, old, new);
742 	}
743 	return err;
744 }
745 
746 /* lockdep annotation is needed for ingress; egress gets it only for name */
747 static struct lock_class_key qdisc_tx_lock;
748 static struct lock_class_key qdisc_rx_lock;
749 
750 /*
751    Allocate and initialize a new qdisc.
752 
753    Parameters are passed via tca.
754  */
755 
756 static struct Qdisc *
757 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
758 	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
759 {
760 	int err;
761 	struct nlattr *kind = tca[TCA_KIND];
762 	struct Qdisc *sch;
763 	struct Qdisc_ops *ops;
764 	struct qdisc_size_table *stab;
765 
766 	ops = qdisc_lookup_ops(kind);
767 #ifdef CONFIG_MODULES
768 	if (ops == NULL && kind != NULL) {
769 		char name[IFNAMSIZ];
770 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
771 			/* We dropped the RTNL semaphore in order to
772 			 * perform the module load.  So, even if we
773 			 * succeeded in loading the module we have to
774 			 * tell the caller to replay the request.  We
775 			 * indicate this using -EAGAIN.
776 			 * We replay the request because the device may
777 			 * go away in the mean time.
778 			 */
779 			rtnl_unlock();
780 			request_module("sch_%s", name);
781 			rtnl_lock();
782 			ops = qdisc_lookup_ops(kind);
783 			if (ops != NULL) {
784 				/* We will try again qdisc_lookup_ops,
785 				 * so don't keep a reference.
786 				 */
787 				module_put(ops->owner);
788 				err = -EAGAIN;
789 				goto err_out;
790 			}
791 		}
792 	}
793 #endif
794 
795 	err = -ENOENT;
796 	if (ops == NULL)
797 		goto err_out;
798 
799 	sch = qdisc_alloc(dev_queue, ops);
800 	if (IS_ERR(sch)) {
801 		err = PTR_ERR(sch);
802 		goto err_out2;
803 	}
804 
805 	sch->parent = parent;
806 
807 	if (handle == TC_H_INGRESS) {
808 		sch->flags |= TCQ_F_INGRESS;
809 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
810 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
811 	} else {
812 		if (handle == 0) {
813 			handle = qdisc_alloc_handle(dev);
814 			err = -ENOMEM;
815 			if (handle == 0)
816 				goto err_out3;
817 		}
818 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
819 	}
820 
821 	sch->handle = handle;
822 
823 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
824 		if (tca[TCA_STAB]) {
825 			stab = qdisc_get_stab(tca[TCA_STAB]);
826 			if (IS_ERR(stab)) {
827 				err = PTR_ERR(stab);
828 				goto err_out3;
829 			}
830 			sch->stab = stab;
831 		}
832 		if (tca[TCA_RATE]) {
833 			spinlock_t *root_lock;
834 
835 			if ((sch->parent != TC_H_ROOT) &&
836 			    !(sch->flags & TCQ_F_INGRESS))
837 				root_lock = qdisc_root_sleeping_lock(sch);
838 			else
839 				root_lock = qdisc_lock(sch);
840 
841 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
842 						root_lock, tca[TCA_RATE]);
843 			if (err) {
844 				/*
845 				 * Any broken qdiscs that would require
846 				 * a ops->reset() here? The qdisc was never
847 				 * in action so it shouldn't be necessary.
848 				 */
849 				if (ops->destroy)
850 					ops->destroy(sch);
851 				goto err_out3;
852 			}
853 		}
854 
855 		qdisc_list_add(sch);
856 
857 		return sch;
858 	}
859 err_out3:
860 	qdisc_put_stab(sch->stab);
861 	dev_put(dev);
862 	kfree((char *) sch - sch->padded);
863 err_out2:
864 	module_put(ops->owner);
865 err_out:
866 	*errp = err;
867 	return NULL;
868 }
869 
870 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
871 {
872 	struct qdisc_size_table *stab = NULL;
873 	int err = 0;
874 
875 	if (tca[TCA_OPTIONS]) {
876 		if (sch->ops->change == NULL)
877 			return -EINVAL;
878 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
879 		if (err)
880 			return err;
881 	}
882 
883 	if (tca[TCA_STAB]) {
884 		stab = qdisc_get_stab(tca[TCA_STAB]);
885 		if (IS_ERR(stab))
886 			return PTR_ERR(stab);
887 	}
888 
889 	qdisc_put_stab(sch->stab);
890 	sch->stab = stab;
891 
892 	if (tca[TCA_RATE])
893 		/* NB: ignores errors from replace_estimator
894 		   because change can't be undone. */
895 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
896 					    qdisc_root_sleeping_lock(sch),
897 					    tca[TCA_RATE]);
898 
899 	return 0;
900 }
901 
902 struct check_loop_arg
903 {
904 	struct qdisc_walker 	w;
905 	struct Qdisc		*p;
906 	int			depth;
907 };
908 
909 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
910 
911 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
912 {
913 	struct check_loop_arg	arg;
914 
915 	if (q->ops->cl_ops == NULL)
916 		return 0;
917 
918 	arg.w.stop = arg.w.skip = arg.w.count = 0;
919 	arg.w.fn = check_loop_fn;
920 	arg.depth = depth;
921 	arg.p = p;
922 	q->ops->cl_ops->walk(q, &arg.w);
923 	return arg.w.stop ? -ELOOP : 0;
924 }
925 
926 static int
927 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
928 {
929 	struct Qdisc *leaf;
930 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
931 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
932 
933 	leaf = cops->leaf(q, cl);
934 	if (leaf) {
935 		if (leaf == arg->p || arg->depth > 7)
936 			return -ELOOP;
937 		return check_loop(leaf, arg->p, arg->depth + 1);
938 	}
939 	return 0;
940 }
941 
942 /*
943  * Delete/get qdisc.
944  */
945 
946 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
947 {
948 	struct net *net = sock_net(skb->sk);
949 	struct tcmsg *tcm = NLMSG_DATA(n);
950 	struct nlattr *tca[TCA_MAX + 1];
951 	struct net_device *dev;
952 	u32 clid = tcm->tcm_parent;
953 	struct Qdisc *q = NULL;
954 	struct Qdisc *p = NULL;
955 	int err;
956 
957 	if (net != &init_net)
958 		return -EINVAL;
959 
960 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
961 		return -ENODEV;
962 
963 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
964 	if (err < 0)
965 		return err;
966 
967 	if (clid) {
968 		if (clid != TC_H_ROOT) {
969 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
970 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
971 					return -ENOENT;
972 				q = qdisc_leaf(p, clid);
973 			} else { /* ingress */
974 				q = dev->rx_queue.qdisc_sleeping;
975 			}
976 		} else {
977 			struct netdev_queue *dev_queue;
978 			dev_queue = netdev_get_tx_queue(dev, 0);
979 			q = dev_queue->qdisc_sleeping;
980 		}
981 		if (!q)
982 			return -ENOENT;
983 
984 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
985 			return -EINVAL;
986 	} else {
987 		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
988 			return -ENOENT;
989 	}
990 
991 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
992 		return -EINVAL;
993 
994 	if (n->nlmsg_type == RTM_DELQDISC) {
995 		if (!clid)
996 			return -EINVAL;
997 		if (q->handle == 0)
998 			return -ENOENT;
999 		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1000 			return err;
1001 	} else {
1002 		qdisc_notify(skb, n, clid, NULL, q);
1003 	}
1004 	return 0;
1005 }
1006 
1007 /*
1008    Create/change qdisc.
1009  */
1010 
1011 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1012 {
1013 	struct net *net = sock_net(skb->sk);
1014 	struct tcmsg *tcm;
1015 	struct nlattr *tca[TCA_MAX + 1];
1016 	struct net_device *dev;
1017 	u32 clid;
1018 	struct Qdisc *q, *p;
1019 	int err;
1020 
1021 	if (net != &init_net)
1022 		return -EINVAL;
1023 
1024 replay:
1025 	/* Reinit, just in case something touches this. */
1026 	tcm = NLMSG_DATA(n);
1027 	clid = tcm->tcm_parent;
1028 	q = p = NULL;
1029 
1030 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1031 		return -ENODEV;
1032 
1033 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1034 	if (err < 0)
1035 		return err;
1036 
1037 	if (clid) {
1038 		if (clid != TC_H_ROOT) {
1039 			if (clid != TC_H_INGRESS) {
1040 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1041 					return -ENOENT;
1042 				q = qdisc_leaf(p, clid);
1043 			} else { /* ingress */
1044 				q = dev->rx_queue.qdisc_sleeping;
1045 			}
1046 		} else {
1047 			struct netdev_queue *dev_queue;
1048 			dev_queue = netdev_get_tx_queue(dev, 0);
1049 			q = dev_queue->qdisc_sleeping;
1050 		}
1051 
1052 		/* It may be the default qdisc; ignore it */
1053 		if (q && q->handle == 0)
1054 			q = NULL;
1055 
1056 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1057 			if (tcm->tcm_handle) {
1058 				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1059 					return -EEXIST;
1060 				if (TC_H_MIN(tcm->tcm_handle))
1061 					return -EINVAL;
1062 				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1063 					goto create_n_graft;
1064 				if (n->nlmsg_flags&NLM_F_EXCL)
1065 					return -EEXIST;
1066 				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1067 					return -EINVAL;
1068 				if (q == p ||
1069 				    (p && check_loop(q, p, 0)))
1070 					return -ELOOP;
1071 				atomic_inc(&q->refcnt);
1072 				goto graft;
1073 			} else {
1074 				if (q == NULL)
1075 					goto create_n_graft;
1076 
1077 				/* This magic test requires explanation.
1078 				 *
1079 				 *   We know that some child q is already
1080 				 *   attached to this parent and we have a choice:
1081 				 *   either to change it or to create/graft a new one.
1082 				 *
1083 				 *   1. We are allowed to create/graft only
1084 				 *   if CREATE and REPLACE flags are set.
1085 				 *
1086 				 *   2. If EXCL is set, the requestor intended that
1087 				 *   the qdisc with tcm_handle not exist, so we
1088 				 *   choose create/graft too.
1089 				 *
1090 				 *   3. The last case is when no flags are set.
1091 				 *   Alas, it is a sort of hole in the API; we
1092 				 *   cannot decide what to do unambiguously.
1093 				 *   For now we select create/graft if the
1094 				 *   user gave a KIND which does not match the existing one.
1095 				 */
1096 				if ((n->nlmsg_flags&NLM_F_CREATE) &&
1097 				    (n->nlmsg_flags&NLM_F_REPLACE) &&
1098 				    ((n->nlmsg_flags&NLM_F_EXCL) ||
1099 				     (tca[TCA_KIND] &&
1100 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1101 					goto create_n_graft;
1102 			}
1103 		}
1104 	} else {
1105 		if (!tcm->tcm_handle)
1106 			return -EINVAL;
1107 		q = qdisc_lookup(dev, tcm->tcm_handle);
1108 	}
1109 
1110 	/* Change qdisc parameters */
1111 	if (q == NULL)
1112 		return -ENOENT;
1113 	if (n->nlmsg_flags&NLM_F_EXCL)
1114 		return -EEXIST;
1115 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1116 		return -EINVAL;
1117 	err = qdisc_change(q, tca);
1118 	if (err == 0)
1119 		qdisc_notify(skb, n, clid, NULL, q);
1120 	return err;
1121 
1122 create_n_graft:
1123 	if (!(n->nlmsg_flags&NLM_F_CREATE))
1124 		return -ENOENT;
1125 	if (clid == TC_H_INGRESS)
1126 		q = qdisc_create(dev, &dev->rx_queue,
1127 				 tcm->tcm_parent, tcm->tcm_parent,
1128 				 tca, &err);
1129 	else
1130 		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
1131 				 tcm->tcm_parent, tcm->tcm_handle,
1132 				 tca, &err);
1133 	if (q == NULL) {
1134 		if (err == -EAGAIN)
1135 			goto replay;
1136 		return err;
1137 	}
1138 
1139 graft:
1140 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1141 	if (err) {
1142 		if (q)
1143 			qdisc_destroy(q);
1144 		return err;
1145 	}
1146 
1147 	return 0;
1148 }
1149 
1150 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1151 			 u32 pid, u32 seq, u16 flags, int event)
1152 {
1153 	struct tcmsg *tcm;
1154 	struct nlmsghdr  *nlh;
1155 	unsigned char *b = skb_tail_pointer(skb);
1156 	struct gnet_dump d;
1157 
1158 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1159 	tcm = NLMSG_DATA(nlh);
1160 	tcm->tcm_family = AF_UNSPEC;
1161 	tcm->tcm__pad1 = 0;
1162 	tcm->tcm__pad2 = 0;
1163 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1164 	tcm->tcm_parent = clid;
1165 	tcm->tcm_handle = q->handle;
1166 	tcm->tcm_info = atomic_read(&q->refcnt);
1167 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1168 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
1169 		goto nla_put_failure;
1170 	q->qstats.qlen = q->q.qlen;
1171 
1172 	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1173 		goto nla_put_failure;
1174 
1175 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1176 					 qdisc_root_sleeping_lock(q), &d) < 0)
1177 		goto nla_put_failure;
1178 
1179 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1180 		goto nla_put_failure;
1181 
1182 	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1183 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
1184 	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
1185 		goto nla_put_failure;
1186 
1187 	if (gnet_stats_finish_copy(&d) < 0)
1188 		goto nla_put_failure;
1189 
1190 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1191 	return skb->len;
1192 
1193 nlmsg_failure:
1194 nla_put_failure:
1195 	nlmsg_trim(skb, b);
1196 	return -1;
1197 }
1198 
1199 static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1200 			u32 clid, struct Qdisc *old, struct Qdisc *new)
1201 {
1202 	struct sk_buff *skb;
1203 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1204 
1205 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1206 	if (!skb)
1207 		return -ENOBUFS;
1208 
1209 	if (old && old->handle) {
1210 		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1211 			goto err_out;
1212 	}
1213 	if (new) {
1214 		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1215 			goto err_out;
1216 	}
1217 
1218 	if (skb->len)
1219 		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1220 
1221 err_out:
1222 	kfree_skb(skb);
1223 	return -EINVAL;
1224 }
1225 
1226 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1227 {
1228 	return (q->flags & TCQ_F_BUILTIN) ? true : false;
1229 }
1230 
1231 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1232 			      struct netlink_callback *cb,
1233 			      int *q_idx_p, int s_q_idx)
1234 {
1235 	int ret = 0, q_idx = *q_idx_p;
1236 	struct Qdisc *q;
1237 
1238 	if (!root)
1239 		return 0;
1240 
1241 	q = root;
1242 	if (q_idx < s_q_idx) {
1243 		q_idx++;
1244 	} else {
1245 		if (!tc_qdisc_dump_ignore(q) &&
1246 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1247 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1248 			goto done;
1249 		q_idx++;
1250 	}
1251 	list_for_each_entry(q, &root->list, list) {
1252 		if (q_idx < s_q_idx) {
1253 			q_idx++;
1254 			continue;
1255 		}
1256 		if (!tc_qdisc_dump_ignore(q) &&
1257 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1258 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1259 			goto done;
1260 		q_idx++;
1261 	}
1262 
1263 out:
1264 	*q_idx_p = q_idx;
1265 	return ret;
1266 done:
1267 	ret = -1;
1268 	goto out;
1269 }
1270 
1271 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1272 {
1273 	struct net *net = sock_net(skb->sk);
1274 	int idx, q_idx;
1275 	int s_idx, s_q_idx;
1276 	struct net_device *dev;
1277 
1278 	if (net != &init_net)
1279 		return 0;
1280 
1281 	s_idx = cb->args[0];
1282 	s_q_idx = q_idx = cb->args[1];
1283 	read_lock(&dev_base_lock);
1284 	idx = 0;
1285 	for_each_netdev(&init_net, dev) {
1286 		struct netdev_queue *dev_queue;
1287 
1288 		if (idx < s_idx)
1289 			goto cont;
1290 		if (idx > s_idx)
1291 			s_q_idx = 0;
1292 		q_idx = 0;
1293 
1294 		dev_queue = netdev_get_tx_queue(dev, 0);
1295 		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1296 			goto done;
1297 
1298 		dev_queue = &dev->rx_queue;
1299 		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
1300 			goto done;
1301 
1302 cont:
1303 		idx++;
1304 	}
1305 
1306 done:
1307 	read_unlock(&dev_base_lock);
1308 
1309 	cb->args[0] = idx;
1310 	cb->args[1] = q_idx;
1311 
1312 	return skb->len;
1313 }
1314 
1315 
1316 
1317 /************************************************
1318  *	Traffic classes manipulation.		*
1319  ************************************************/
1320 
1321 
1322 
1323 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1324 {
1325 	struct net *net = sock_net(skb->sk);
1326 	struct netdev_queue *dev_queue;
1327 	struct tcmsg *tcm = NLMSG_DATA(n);
1328 	struct nlattr *tca[TCA_MAX + 1];
1329 	struct net_device *dev;
1330 	struct Qdisc *q = NULL;
1331 	const struct Qdisc_class_ops *cops;
1332 	unsigned long cl = 0;
1333 	unsigned long new_cl;
1334 	u32 pid = tcm->tcm_parent;
1335 	u32 clid = tcm->tcm_handle;
1336 	u32 qid = TC_H_MAJ(clid);
1337 	int err;
1338 
1339 	if (net != &init_net)
1340 		return -EINVAL;
1341 
1342 	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1343 		return -ENODEV;
1344 
1345 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1346 	if (err < 0)
1347 		return err;
1348 
1349 	/*
1350 	   parent == TC_H_UNSPEC - unspecified parent.
1351 	   parent == TC_H_ROOT   - class is root, which has no parent.
1352 	   parent == X:0	 - parent is root class.
1353 	   parent == X:Y	 - parent is a node in hierarchy.
1354 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1355 
1356 	   handle == 0:0	 - generate handle from kernel pool.
1357 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1358 	   handle == X:Y	 - the handle is fully specified.
1359 	   handle == X:0	 - root class.
1360 	 */
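	/*
	 * A concrete, hypothetical request: "tc class add dev eth0
	 * parent 1:1 classid 1:10 ..." arrives here with
	 * tcm_parent = 1:1 and tcm_handle = 1:10, so qid resolves to
	 * the major 1:0 and both IDs must agree on it.
	 */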
1361 
1362 	/* Step 1. Determine qdisc handle X:0 */
1363 
1364 	dev_queue = netdev_get_tx_queue(dev, 0);
1365 	if (pid != TC_H_ROOT) {
1366 		u32 qid1 = TC_H_MAJ(pid);
1367 
1368 		if (qid && qid1) {
1369 			/* If both majors are known, they must be identical. */
1370 			if (qid != qid1)
1371 				return -EINVAL;
1372 		} else if (qid1) {
1373 			qid = qid1;
1374 		} else if (qid == 0)
1375 			qid = dev_queue->qdisc_sleeping->handle;
1376 
1377 		/* Now qid is a genuine qdisc handle consistent with
1378 		   both parent and child.
1379 
1380 		   TC_H_MAJ(pid) may still be unspecified; complete it now.
1381 		 */
1382 		if (pid)
1383 			pid = TC_H_MAKE(qid, pid);
1384 	} else {
1385 		if (qid == 0)
1386 			qid = dev_queue->qdisc_sleeping->handle;
1387 	}
1388 
1389 	/* OK. Locate qdisc */
1390 	if ((q = qdisc_lookup(dev, qid)) == NULL)
1391 		return -ENOENT;
1392 
1393 	/* And check that it supports classes */
1394 	cops = q->ops->cl_ops;
1395 	if (cops == NULL)
1396 		return -EINVAL;
1397 
1398 	/* Now try to get class */
1399 	if (clid == 0) {
1400 		if (pid == TC_H_ROOT)
1401 			clid = qid;
1402 	} else
1403 		clid = TC_H_MAKE(qid, clid);
1404 
1405 	if (clid)
1406 		cl = cops->get(q, clid);
1407 
1408 	if (cl == 0) {
1409 		err = -ENOENT;
1410 		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1411 			goto out;
1412 	} else {
1413 		switch (n->nlmsg_type) {
1414 		case RTM_NEWTCLASS:
1415 			err = -EEXIST;
1416 			if (n->nlmsg_flags&NLM_F_EXCL)
1417 				goto out;
1418 			break;
1419 		case RTM_DELTCLASS:
1420 			err = cops->delete(q, cl);
1421 			if (err == 0)
1422 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1423 			goto out;
1424 		case RTM_GETTCLASS:
1425 			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1426 			goto out;
1427 		default:
1428 			err = -EINVAL;
1429 			goto out;
1430 		}
1431 	}
1432 
1433 	new_cl = cl;
1434 	err = cops->change(q, clid, pid, tca, &new_cl);
1435 	if (err == 0)
1436 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1437 
1438 out:
1439 	if (cl)
1440 		cops->put(q, cl);
1441 
1442 	return err;
1443 }
1444 
1445 
1446 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1447 			  unsigned long cl,
1448 			  u32 pid, u32 seq, u16 flags, int event)
1449 {
1450 	struct tcmsg *tcm;
1451 	struct nlmsghdr  *nlh;
1452 	unsigned char *b = skb_tail_pointer(skb);
1453 	struct gnet_dump d;
1454 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1455 
1456 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1457 	tcm = NLMSG_DATA(nlh);
1458 	tcm->tcm_family = AF_UNSPEC;
1459 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1460 	tcm->tcm_parent = q->handle;
1461 	tcm->tcm_handle = q->handle;
1462 	tcm->tcm_info = 0;
1463 	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1464 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1465 		goto nla_put_failure;
1466 
1467 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1468 					 qdisc_root_sleeping_lock(q), &d) < 0)
1469 		goto nla_put_failure;
1470 
1471 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1472 		goto nla_put_failure;
1473 
1474 	if (gnet_stats_finish_copy(&d) < 0)
1475 		goto nla_put_failure;
1476 
1477 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1478 	return skb->len;
1479 
1480 nlmsg_failure:
1481 nla_put_failure:
1482 	nlmsg_trim(skb, b);
1483 	return -1;
1484 }
1485 
1486 static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1487 			  struct Qdisc *q, unsigned long cl, int event)
1488 {
1489 	struct sk_buff *skb;
1490 	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1491 
1492 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1493 	if (!skb)
1494 		return -ENOBUFS;
1495 
1496 	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1497 		kfree_skb(skb);
1498 		return -EINVAL;
1499 	}
1500 
1501 	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1502 }
1503 
1504 struct qdisc_dump_args
1505 {
1506 	struct qdisc_walker w;
1507 	struct sk_buff *skb;
1508 	struct netlink_callback *cb;
1509 };
1510 
1511 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1512 {
1513 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1514 
1515 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1516 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1517 }
1518 
1519 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1520 				struct tcmsg *tcm, struct netlink_callback *cb,
1521 				int *t_p, int s_t)
1522 {
1523 	struct qdisc_dump_args arg;
1524 
1525 	if (tc_qdisc_dump_ignore(q) ||
1526 	    *t_p < s_t || !q->ops->cl_ops ||
1527 	    (tcm->tcm_parent &&
1528 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1529 		(*t_p)++;
1530 		return 0;
1531 	}
1532 	if (*t_p > s_t)
1533 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1534 	arg.w.fn = qdisc_class_dump;
1535 	arg.skb = skb;
1536 	arg.cb = cb;
1537 	arg.w.stop  = 0;
1538 	arg.w.skip = cb->args[1];
1539 	arg.w.count = 0;
1540 	q->ops->cl_ops->walk(q, &arg.w);
1541 	cb->args[1] = arg.w.count;
1542 	if (arg.w.stop)
1543 		return -1;
1544 	(*t_p)++;
1545 	return 0;
1546 }
1547 
1548 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1549 			       struct tcmsg *tcm, struct netlink_callback *cb,
1550 			       int *t_p, int s_t)
1551 {
1552 	struct Qdisc *q;
1553 
1554 	if (!root)
1555 		return 0;
1556 
1557 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1558 		return -1;
1559 
1560 	list_for_each_entry(q, &root->list, list) {
1561 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1562 			return -1;
1563 	}
1564 
1565 	return 0;
1566 }
1567 
1568 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1569 {
1570 	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1571 	struct net *net = sock_net(skb->sk);
1572 	struct netdev_queue *dev_queue;
1573 	struct net_device *dev;
1574 	int t, s_t;
1575 
1576 	if (net != &init_net)
1577 		return 0;
1578 
1579 	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1580 		return 0;
1581 	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
1582 		return 0;
1583 
1584 	s_t = cb->args[0];
1585 	t = 0;
1586 
1587 	dev_queue = netdev_get_tx_queue(dev, 0);
1588 	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1589 		goto done;
1590 
1591 	dev_queue = &dev->rx_queue;
1592 	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
1593 		goto done;
1594 
1595 done:
1596 	cb->args[0] = t;
1597 
1598 	dev_put(dev);
1599 	return skb->len;
1600 }
1601 
1602 /* Main classifier routine: scans the classifier chain attached
1603    to this qdisc, (optionally) tests for the protocol and asks
1604    specific classifiers.
1605  */
1606 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1607 		       struct tcf_result *res)
1608 {
1609 	__be16 protocol = skb->protocol;
1610 	int err = 0;
1611 
1612 	for (; tp; tp = tp->next) {
1613 		if ((tp->protocol == protocol ||
1614 		     tp->protocol == htons(ETH_P_ALL)) &&
1615 		    (err = tp->classify(skb, tp, res)) >= 0) {
1616 #ifdef CONFIG_NET_CLS_ACT
1617 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1618 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1619 #endif
1620 			return err;
1621 		}
1622 	}
1623 	return -1;
1624 }
1625 EXPORT_SYMBOL(tc_classify_compat);
1626 
1627 int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1628 		struct tcf_result *res)
1629 {
1630 	int err = 0;
1631 	__be16 protocol;
1632 #ifdef CONFIG_NET_CLS_ACT
1633 	struct tcf_proto *otp = tp;
1634 reclassify:
1635 #endif
1636 	protocol = skb->protocol;
1637 
1638 	err = tc_classify_compat(skb, tp, res);
1639 #ifdef CONFIG_NET_CLS_ACT
1640 	if (err == TC_ACT_RECLASSIFY) {
1641 		u32 verd = G_TC_VERD(skb->tc_verd);
1642 		tp = otp;
1643 
1644 		if (verd++ >= MAX_REC_LOOP) {
1645 			printk("rule prio %u protocol %02x reclassify loop, "
1646 			       "packet dropped\n",
1647 			       tp->prio&0xffff, ntohs(tp->protocol));
1648 			return TC_ACT_SHOT;
1649 		}
1650 		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1651 		goto reclassify;
1652 	}
1653 #endif
1654 	return err;
1655 }
1656 EXPORT_SYMBOL(tc_classify);
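/*
 * A hypothetical sketch of a caller: a classful qdisc's ->enqueue()
 * typically classifies like this.  "example_*", "filter_list" and
 * "default_class" are invented names.
 */
#if 0	/* illustration only */
static int example_classful_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct example_sched_data *q = qdisc_priv(sch);
	struct example_class *cl;
	struct tcf_result res;

	if (tc_classify(skb, q->filter_list, &res) >= 0)
		cl = example_find_class(q, res.classid);
	else
		cl = q->default_class;	/* no filter matched */
	return qdisc_enqueue(skb, cl->qdisc);	/* hand off to class qdisc */
}
#endif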
1657 
1658 void tcf_destroy(struct tcf_proto *tp)
1659 {
1660 	tp->ops->destroy(tp);
1661 	module_put(tp->ops->owner);
1662 	kfree(tp);
1663 }
1664 
1665 void tcf_destroy_chain(struct tcf_proto **fl)
1666 {
1667 	struct tcf_proto *tp;
1668 
1669 	while ((tp = *fl) != NULL) {
1670 		*fl = tp->next;
1671 		tcf_destroy(tp);
1672 	}
1673 }
1674 EXPORT_SYMBOL(tcf_destroy_chain);
1675 
1676 #ifdef CONFIG_PROC_FS
1677 static int psched_show(struct seq_file *seq, void *v)
1678 {
1679 	struct timespec ts;
1680 
1681 	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1682 	seq_printf(seq, "%08x %08x %08x %08x\n",
1683 		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1684 		   1000000,
1685 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1686 
1687 	return 0;
1688 }
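/*
 * /proc/net/psched prints four hex words: nanoseconds per microsecond,
 * nanoseconds per psched tick, the constant 1000000, and the timer
 * resolution in Hz.  On a hypothetical system with high-resolution
 * timers the line might read "000003e8 00000400 000f4240 3b9aca00";
 * tc(8) uses the first two words to convert its ticks to real time.
 */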
1689 
1690 static int psched_open(struct inode *inode, struct file *file)
1691 {
1692 	return single_open(file, psched_show, PDE(inode)->data);
1693 }
1694 
1695 static const struct file_operations psched_fops = {
1696 	.owner = THIS_MODULE,
1697 	.open = psched_open,
1698 	.read  = seq_read,
1699 	.llseek = seq_lseek,
1700 	.release = single_release,
1701 };
1702 #endif
1703 
1704 static int __init pktsched_init(void)
1705 {
1706 	register_qdisc(&pfifo_qdisc_ops);
1707 	register_qdisc(&bfifo_qdisc_ops);
1708 	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
1709 
1710 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1711 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1712 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1713 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1714 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1715 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1716 
1717 	return 0;
1718 }
1719 
1720 subsys_initcall(pktsched_init);
1721