/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling must
   additionally be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

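/* Take both locks in the required order (qdisc_tree_lock first, then
 * dev->queue_lock) with BHs disabled.  Used by configuration paths such
 * as dev_init_scheduler() and dev_shutdown() below that change both the
 * qdisc tree and the active queue.
 */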
void qdisc_lock_tree(struct net_device *dev)
{
	write_lock_bh(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
	write_unlock_bh(&qdisc_tree_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick the device.
   Note that this procedure can be called by a watchdog timer.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled (or the packet had
                  to be requeued).
            <0  - a packet was transmitted or dropped; the queue may
                  still contain packets, so it is worth calling again.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can do a trylock, as we do here; in case
		 * of lock contention it should return NETDEV_TX_LOCKED
		 * and the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

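/* Transmit packets from the qdisc until qdisc_restart() reports no more
 * work or the driver stops the queue, then clear the
 * __LINK_STATE_QDISC_RUNNING bit so another CPU may run the queue later.
 * Called with dev->queue_lock held and BHs disabled (see the note above
 * qdisc_restart()).
 */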
void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

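/* Transmit-watchdog timer.  If the device is present, running and has
 * carrier, and its queue has been stopped for longer than
 * dev->watchdog_timeo, report a transmit timeout to the driver via
 * dev->tx_timeout() and re-arm the timer.
 */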
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

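/* Arm the transmit watchdog if the driver supplies a tx_timeout handler.
 * A non-positive watchdog_timeo is replaced with a default of 5 seconds.
 * A device reference is held for as long as the timer is pending.
 */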
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

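/* Clear the NOCARRIER bit and fire a linkwatch event if carrier was
 * previously off; restart the transmit watchdog if the device is running.
 */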
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

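/* Set the NOCARRIER bit and fire a linkwatch event if carrier was
 * previously on.
 */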
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


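/* Map TC_PRIO_* values (skb->priority & TC_PRIO_MAX) to one of the three
 * pfifo_fast bands.  Band 0 (interactive/control traffic) is dequeued
 * first, band 2 (filler/bulk traffic) last.
 */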
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

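/* Enqueue at the tail of the band selected by skb->priority.  The new
 * packet is dropped once that band already holds dev->tx_queue_len
 * packets.
 */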
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

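/* Allocate a zeroed Qdisc together with ops->priv_size bytes of private
 * data, both aligned to QDISC_ALIGNTO.  Takes a reference on the device
 * and returns the new qdisc with refcnt set to 1, or an ERR_PTR() on
 * allocation failure.
 */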
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
}

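/* Allocate a qdisc of the given type and run its ->init() with default
 * (NULL) options; returns NULL on failure.  Typical use, as in
 * dev_activate() below (sketch only):
 *
 *	qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
 *	if (qdisc == NULL)
 *		goto fail;
 */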
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it.
 */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	write_lock(&qdisc_tree_lock);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	write_unlock(&qdisc_tree_lock);
	module_put(ops->owner);

	dev_put(qdisc->dev);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

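/* Drop one reference.  When the last reference goes away, unlink the
 * qdisc (and, for a classful qdisc, any inner qdiscs) from
 * dev->qdisc_list and defer the actual teardown to __qdisc_destroy()
 * via call_rcu().
 */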
void qdisc_destroy(struct Qdisc *qdisc)
{
	struct list_head cql = LIST_HEAD_INIT(cql);
	struct Qdisc *cq, *q, *n;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	if (!list_empty(&qdisc->list)) {
		if (qdisc->ops->cl_ops == NULL)
			list_del(&qdisc->list);
		else
			list_move(&qdisc->list, &cql);
	}

	/* unlink inner qdiscs from dev->qdisc_list immediately */
	list_for_each_entry(cq, &cql, list)
		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
				if (q->ops->cl_ops == NULL)
					list_del_init(&q->list);
				else
					list_move_tail(&q->list, &cql);
			}
	list_for_each_entry_safe(cq, n, &cql, list)
		list_del_init(&cq->list);

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device yet;
	   create a default one, i.e. pfifo_fast for devices which
	   need queueing and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock_bh(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock_bh(&qdisc_tree_lock);
		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock_bh(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock_bh(&qdisc_tree_lock);
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

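/* Detach the active qdisc (replace it with noop_qdisc), reset it, stop
 * the watchdog timer and wait until all outstanding dev_queue_xmit()
 * and qdisc_run() callers have finished before returning.
 */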
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

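/* Install the noop qdisc on a new device and initialize the transmit
 * watchdog timer; a real qdisc is attached later by dev_activate().
 */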
void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

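/* Tear down the device's qdiscs: point dev->qdisc and dev->qdisc_sleeping
 * back at noop_qdisc and destroy the previously attached root (and, if
 * present, ingress) qdisc under qdisc_tree_lock.
 */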
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);