/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing a packet from the queue.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
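
/* A minimal sketch, for orientation only, of how a (hypothetical) qdisc
 * module plugs into the API managed by this file.  The "example_*" names
 * below are placeholders, not symbols defined anywhere in the tree:
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= sizeof(struct example_sched_data),
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.init		= example_init,
 *		.reset		= example_reset,
 *		.destroy	= example_destroy,
 *		.change		= example_change,
 *		.dump		= example_dump,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * The module then calls register_qdisc(&example_qdisc_ops) from its
 * module_init() and unregister_qdisc(&example_qdisc_ops) from its
 * module_exit(); the netlink plumbing, grafting and notifications are
 * handled by the code below.
 */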

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utilities, we detect the linklayer setting by checking whether the
 * rate table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
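
/* A worked example of the detection above, under assumed parameters
 * (cell_log = 3, mpu = 0; purely illustrative, not taken from iproute2):
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, giving
 * cell_low = 0 >> 3 = 0 and cell_high = (48 >> 3) - 1 = 5.  Entry 0
 * covers packet sizes 0-7 and entry 5 covers sizes 40-47.  On an
 * ATM-modified table both sizes cost exactly one 48-byte cell, so
 * rtab[0] == rtab[5] and TC_LINKLAYER_ATM is reported; on an ordinary
 * Ethernet table their transmit times differ, so the entries differ
 * and TC_LINKLAYER_ETHERNET is reported.
 */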

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
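
/* Illustrative arithmetic for the size table lookup above, with made-up
 * parameters (overhead = 24, cell_align = -1, cell_log = 6, tsize = 512):
 * a 1000 byte skb gives pkt_len = 1000 + 24 = 1024, then
 * slot = (1024 - 1) >> 6 = 15, so the packet is billed as
 * stab->data[15] << size_log bytes.  Slots past the end of the table
 * are extrapolated from the last entry, as the else branch shows.
 */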

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);

	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel.
 * The possible range is [8000-FFFF]:0000 (0x8000 values).
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}
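
/* For reference: a handle is a 32-bit major:minor pair built with
 * TC_H_MAKE(), and a qdisc's own handle always has minor 0.  As an
 * illustrative example, TC_H_MAKE(0x80010000U, 0) == 0x80010000, which
 * tc would print as "8001:".  The loop above hands out majors
 * 0x8000-0xffff, skipping any handle already in use on the device.
 */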

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will call qdisc_lookup_ops() again
				 * on replay, so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require an ops->reset() here?
	 * The qdisc was never in action, so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignore errors from gen_replace_estimator()
		   because the change cannot be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and we have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags
				 *   are set.
				 *
				 *   2. If EXCL is set, the requestor meant
				 *   that a qdisc with handle tcm_handle is
				 *   not expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are
				 *   set. Alas, it is a sort of hole in the
				 *   API: we cannot decide what to do
				 *   unambiguously. For now we select
				 *   create/graft if the user gave a KIND
				 *   that does not match the existing one.
				 */
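				/* For orientation, the flags iproute2 sends
				 * (an illustrative note, not part of the
				 * original text): "tc qdisc add" uses
				 * CREATE|EXCL, which fails this test and
				 * returns -EEXIST below; "tc qdisc replace"
				 * uses CREATE|REPLACE and grafts a new qdisc
				 * when the KIND differs; "tc qdisc change"
				 * sends no flags and always falls through
				 * to the change path below.
				 */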
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
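
	/* As an illustrative example (not from the original text): the
	 * command "tc class add dev eth0 parent 1: classid 1:10 ..."
	 * arrives here with tcm_parent == 1:0 and tcm_handle == 1:10,
	 * so the qdisc handle X:0 resolved below is 1:0.
	 */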

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent
		 * with both parent and child.
		 *
		 * TC_H_MAJ(portid) may still be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for the protocol, and asks
 * the specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err;

	for (; tp; tp = tp->next) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
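
/* A minimal sketch, for illustration only, of how a classful qdisc's
 * ->enqueue() typically consumes these helpers ("q->filter_list" and
 * "struct example_class" are assumed names, not defined here):
 *
 *	struct tcf_result res;
 *	struct example_class *cl = NULL;
 *
 *	if (tc_classify(skb, q->filter_list, &res) >= 0)
 *		cl = (struct example_class *)res.class;
 *	if (cl == NULL)
 *		cl = q->default_class;	// fall back when nothing matched
 *
 * The unsigned long res.class cookie is whatever the qdisc's own
 * ->get()/->bind_tcf() handed to the classifier when the filter was
 * bound to the class.
 */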

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
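
/* Illustrative output of /proc/net/psched, assuming PSCHED_TICKS2NS(1)
 * == 64 and a 1 ns hrtimer resolution (actual values vary by
 * configuration):
 *
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. nanoseconds per microsecond, nanoseconds per psched tick, the
 * historical 1000000 constant, and clock ticks per second.
 */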

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create("psched", 0, net->proc_net, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

	return 0;
}

subsys_initcall(pktsched_init);