/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdisc's are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work which is common to all
   qdiscs, and to provide rtnetlink notifications.

   All the real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but this does not mean that the queue is empty, only that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was dequeued once. It is used for non-standard
   or just buggy devices, which may defer output even when
   netif_queue_stopped() == 0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
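
/* For orientation, a minimal sketch of what a qdisc module supplies
 * (illustrative only; the "example_*" names are hypothetical, while the
 * struct Qdisc_ops fields and the fifo helpers are the real ones):
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		return qdisc_enqueue_tail(skb, sch);	// 0 or NET_XMIT_...
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);		// skb, or NULL
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.owner		= THIS_MODULE,
 *	};
 */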

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->requeue == NULL)
		qops->requeue = noop_qdisc_ops.requeue;
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
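
/* Typical registration from a qdisc module (a sketch; example_qdisc_ops
 * is the hypothetical ops table from the overview above):
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */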

/* We know the handle. Find the qdisc among all the qdiscs attached to the
   device (the root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

/*
 * This lock is needed until some qdiscs stop calling qdisc_tree_decrease_qlen()
 * without rtnl_lock(); currently hfsc_dequeue(), netem_dequeue(), tbf_dequeue()
 */
static DEFINE_SPINLOCK(qdisc_list_lock);

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		spin_lock_bh(&qdisc_list_lock);
		list_del(&q->list);
		spin_unlock_bh(&qdisc_list_lock);
	}
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;
	struct Qdisc *q;

	spin_lock_bh(&qdisc_list_lock);

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);
		if (q)
			goto unlock;
	}

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);

unlock:
	spin_unlock_bh(&qdisc_list_lock);

	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
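
/* How a shaper typically obtains its rate table in ->init() (a sketch
 * modelled on users such as sch_tbf; the attribute layout around "qopt"
 * is illustrative):
 *
 *	struct tc_ratespec *r = &qopt->rate;
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(r, tb[TCA_TBF_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);	// on ->destroy(), or when replacing it
 */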

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
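
/* Worked example with assumed numbers: for overhead = 24, cell_align = -1,
 * cell_log = 6, size_log = 6 and a 512 byte skb, pkt_len starts at 536,
 * slot = (536 - 1) >> 6 = 8, and (for tsize > 8) the final value is
 * stab->data[8] << 6.
 */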

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	smp_wmb();
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
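
/* Typical pattern in a rate limiting qdisc (a sketch; the watchdog
 * "q->wd" embedded in the qdisc's private data is assumed):
 *
 *	->init():	qdisc_watchdog_init(&q->wd, sch);
 *	->dequeue():	if (now < time_to_send) {
 *				qdisc_watchdog_schedule(&q->wd, time_to_send);
 *				return NULL;	// throttled, retry on the timer
 *			}
 *	->destroy():	qdisc_watchdog_cancel(&q->wd);
 */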

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
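
/* Sketch of how a classful qdisc uses this hash (only the
 * qdisc_class_hash_* helpers and struct Qdisc_class_common are real;
 * the rest is hypothetical):
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;
 *		...
 *	};
 *
 *	->init():	qdisc_class_hash_init(&q->clhash);
 *	new class:	cl->common.classid = classid;
 *			qdisc_class_hash_insert(&q->clhash, &cl->common);
 *			qdisc_class_hash_grow(sch, &q->clhash);
 *	->destroy():	qdisc_class_hash_destroy(&q->clhash);
 */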

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

/* Attach the top-level qdisc to the device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune the old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft the new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
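
/* Typical caller (a sketch): a qdisc that drops packets from an inner
 * child outside the usual enqueue/dequeue path reports the shrinkage
 * upwards so every ancestor's q.qlen stays consistent:
 *
 *	unsigned int qlen = q->qdisc->q.qlen;
 *
 *	... drop packets from q->qdisc ...
 *	qdisc_tree_decrease_qlen(q->qdisc, qlen - q->qdisc->q.qlen);
 */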

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replay will call qdisc_lookup_ops()
				 * again, so don't keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * an ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant that
				 *   a qdisc with handle tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, this is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match the existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
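
/* For reference, the flag combinations that iproute2's "tc" is believed
 * to send (listed only to illustrate the decision logic above):
 *
 *	tc qdisc add ...	NLM_F_CREATE | NLM_F_EXCL
 *	tc qdisc change ...	(no flags)
 *	tc qdisc replace ...	NLM_F_CREATE | NLM_F_REPLACE
 */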

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is the root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - fully specified, use as is.
	   handle == X:0	 - root class.
	 */
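
	/* Worked example (illustrative): parent 1:0 (pid 0x00010000) with
	 * handle 0:10 gives qid 0x00010000, and the class created below
	 * becomes 1:10, i.e. TC_H_MAKE(0x00010000, 0x10) == 0x00010010.
	 */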

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle, consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			  struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1598 
1599 /* Main classifier routine: scans classifier chain attached
1600    to this qdisc, (optionally) tests for protocol and asks
1601    specific classifiers.
1602  */
1603 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1604 		       struct tcf_result *res)
1605 {
1606 	__be16 protocol = skb->protocol;
1607 	int err = 0;
1608 
1609 	for (; tp; tp = tp->next) {
1610 		if ((tp->protocol == protocol ||
1611 		     tp->protocol == htons(ETH_P_ALL)) &&
1612 		    (err = tp->classify(skb, tp, res)) >= 0) {
1613 #ifdef CONFIG_NET_CLS_ACT
1614 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1615 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1616 #endif
1617 			return err;
1618 		}
1619 	}
1620 	return -1;
1621 }
1622 EXPORT_SYMBOL(tc_classify_compat);
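
/* Sketch of the usual call site in a classful qdisc's ->enqueue()
 * ("q->filter_list" and the class lookup helper are illustrative):
 *
 *	struct tcf_result res;
 *
 *	switch (tc_classify(skb, q->filter_list, &res)) {
 *	case TC_ACT_SHOT:	// only under CONFIG_NET_CLS_ACT
 *		kfree_skb(skb);
 *		return NET_XMIT_DROP;
 *	default:
 *		cl = example_find_class(q, res.classid);	// hypothetical
 *	}
 */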

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk(KERN_ERR "rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
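
/* Example output (illustrative, assuming a 1 ns hrtimer resolution and
 * the usual 1024 ns PSCHED_US2NS() tick):
 *
 *	000003e8 00000400 000f4240 3b9aca00
 */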

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);