/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that can enqueue packets and dequeue them (when the device
   is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate
   the handle-based information supplied by the user into a form
   more intelligible to the kernel, to perform sanity checks and
   the parts of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty: it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
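
/* Illustrative sketch only (not part of this file's API): a minimal
 * "queue"-category qdisc would wire up the callbacks described above
 * roughly as follows.  All "example_*" names are hypothetical; see
 * sch_fifo.c for a real, minimal implementation of this pattern.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (likely(skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_reshape_fail(skb, sch);
 *	}
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= qdisc_dequeue_head,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 */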

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
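
/* Typical registration pattern from a scheduler module's init/exit
 * (sketch; "example_qdisc_ops" is hypothetical, but most sch_*.c
 * modules follow exactly this shape):
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *
 * register_qdisc() fails with -EEXIST if ops with the same id are
 * already present and with -EINVAL if mandatory callbacks are missing.
 */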

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
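
/* Note: this is reached from the net.core.default_qdisc sysctl (see
 * set_default_qdisc() in net/core/sysctl_net_core.c), e.g.
 *
 *	echo fq_codel > /proc/sys/net/core/default_qdisc
 *
 * after which default_qdisc_ops is used whenever a device needs a
 * default root qdisc attached.
 */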

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_list_add(struct Qdisc *q)
{
	struct Qdisc *root = qdisc_dev(q)->qdisc;

	WARN_ON_ONCE(root == &noop_qdisc);
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &root->list);
}
EXPORT_SYMBOL(qdisc_list_add);

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* Older iproute2 versions did not transfer the linklayer setting, and
 * the rate table lookup system has since been dropped from the kernel.
 * To stay backward compatible with older iproute2 tc utilities, we
 * detect the linklayer setting by checking whether the rate table was
 * modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
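
/* Worked example of the detection above, assuming mpu = 0 and
 * cell_log = 3: low = roundup(0, 48) = 0, high = roundup(1, 48) = 48,
 * so cell_low = 0 and cell_high = (48 >> 3) - 1 = 5.  rtab[0]..rtab[5]
 * cover packet sizes below 48 bytes, which an ATM-aligned table all
 * charges as a single 48-byte cell, hence rtab[0] == rtab[5] and we
 * return TC_LINKLAYER_ATM.  An unmodified Ethernet table grows with
 * each 8-byte step instead, so those two entries differ.
 */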

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
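
/* A rate table maps a packet size to its transmission time in
 * scheduler ticks.  Callers index it along these lines (hypothetical
 * helper shown purely for illustration; rtab->data has
 * TC_RTAB_SIZE/4 == 256 slots of r->cell_log granularity):
 *
 *	static inline u32 example_l2t(struct qdisc_rate_table *rtab,
 *				      unsigned int pktlen)
 *	{
 *		int slot = pktlen >> rtab->rate.cell_log;
 *
 *		if (slot > 255)
 *			slot = 255;
 *		return rtab->data[slot];
 *	}
 */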

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
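
/* Worked example for the lookup above: with overhead = 4,
 * cell_align = -1, cell_log = 6 and a 60-byte skb, pkt_len starts at
 * 64, slot = (64 - 1) >> 6 = 0, and the charged length becomes
 * stab->data[0] << size_log.  Slots at or beyond tsize fall back to
 * data[tsize - 1] * (slot / tsize) + data[slot % tsize], so oversized
 * packets still get a sensible value.
 */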

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);

	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
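
/* Typical watchdog usage in a shaping qdisc's ->dequeue() (sketch
 * with illustrative names; sch_tbf.c is the canonical user):
 *
 *	if (next_packet_not_due_yet) {
 *		qdisc_watchdog_schedule_ns(&q->watchdog, t_next);
 *		return NULL;	(throttled; the hrtimer re-arms us)
 *	}
 *
 * When the hrtimer fires, qdisc_watchdog() clears the throttled state
 * and kicks the root qdisc via __netif_schedule().
 */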

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
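
/* Usage pattern for the class hash (hypothetical struct shown;
 * sch_htb.c and sch_hfsc.c follow it): embed a Qdisc_class_common in
 * the per-class structure and let qdisc_class_find() do lookups.
 *
 *	struct example_class {
 *		struct Qdisc_class_common common;
 *		(per-class state follows)
 *	};
 *
 *	cl->common.classid = classid;
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);
 */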

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}
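
/* A handle is a 32-bit major:minor pair (TC_H_MAJ/TC_H_MIN); qdiscs
 * always use minor 0.  The allocator above hands out majors
 * 0x8000..0xffff, so the first automatic handle is
 * TC_H_MAKE(0x80000000U, 0) + 0x10000 == 0x80010000, which tc(8)
 * prints as "8001:".
 */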

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	int drops;

	if (n == 0)
		return;
	drops = max_t(int, n, 0);
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.drops += drops;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* qdisc_lookup_ops() will run again on
				 * replay, so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant that
				 *   a qdisc with this tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API: we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for the protocol, and asks
 * specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
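
/* Classful qdiscs typically call tc_classify() from their ->enqueue()
 * path to pick a class, roughly (sketch with illustrative names):
 *
 *	struct tcf_result res;
 *	int result = tc_classify(skb, q->filter_list, &res);
 *
 * If result is TC_ACT_SHOT/TC_ACT_STOLEN/TC_ACT_QUEUED (with
 * CONFIG_NET_CLS_ACT), the packet was consumed by an action and must
 * not be enqueued; otherwise res.class/res.classid identify the
 * target class.  tc_classify_compat() is the same minus the
 * reclassification loop handling above.
 */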

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create("psched", 0, net->proc_net, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

	return 0;
}

subsys_initcall(pktsched_init);