/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}
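
/* Illustrative sketch, not part of the original file: code that replaces a
 * device's qdisc tree (e.g. the graft paths in net/sched/sch_api.c) is
 * expected to bracket the pointer swap roughly like this:
 *
 *	qdisc_lock_tree(dev);
 *	oqdisc = dev->qdisc_sleeping;
 *	... detach the old qdisc, attach the new one ...
 *	dev->qdisc_sleeping = qdisc;
 *	dev->qdisc = &noop_qdisc;
 *	qdisc_unlock_tree(dev);
 *
 * Taking both queue_lock and ingress_lock ensures that neither the egress
 * nor the ingress path can observe a half-updated tree.
 */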

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */


/* Kick device.

   Returns:  0  - queue is empty or throttled.
	    >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncontended locks can be quite expensive.
		 * The driver can also use a trylock, as we do here; on
		 * lock contention it should return NETDEV_TX_LOCKED and
		 * the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit lock owner and drop
				   the packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					goto out;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto out;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 0;
	}

out:
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	do {
		if (!qdisc_restart(dev))
			break;
	} while (!netif_queue_stopped(dev));

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
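
/* For context, a sketch (not part of this file): the qdisc_run() wrapper in
 * include/net/pkt_sched.h guards __qdisc_run() with the
 * __LINK_STATE_QDISC_RUNNING bit, roughly:
 *
 *	static inline void qdisc_run(struct net_device *dev)
 *	{
 *		if (!netif_queue_stopped(dev) &&
 *		    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *			__qdisc_run(dev);
 *	}
 *
 * so only one CPU runs a given device's qdisc at a time; __qdisc_run()
 * clears the bit again when it stops.
 */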

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}
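
/* Illustrative sketch (an assumption about driver usage, not taken from this
 * file): a driver arms the transmit watchdog simply by filling in its
 * timeout handler, and optionally the interval, before register_netdev():
 *
 *	dev->tx_timeout = mydrv_tx_timeout;	hypothetical driver callback
 *	dev->watchdog_timeo = 2 * HZ;		defaults to 5*HZ if <= 0
 *
 * The timer itself is armed and re-armed by __netdev_watchdog_up() and
 * dev_watchdog() above.
 */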

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};
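
/* noop_qdisc is what dev->qdisc points at while a device is deactivated (see
 * dev_deactivate() below): anything that still reaches the transmit path is
 * simply freed and reported as NET_XMIT_CN by noop_enqueue() above.
 */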

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
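
/* Band 0 is dequeued first, band 2 last (see pfifo_fast_dequeue() below).
 * With the map above, TC_PRIO_CONTROL (7) and TC_PRIO_INTERACTIVE (6) land
 * in band 0, TC_PRIO_BESTEFFORT (0) and the remaining values in band 1, and
 * the bulk/filler priorities (1, 2, 3 and 5) in band 2.
 */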

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
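
/* The private area of a pfifo_fast qdisc (see .priv_size above) is simply an
 * array of PFIFO_FAST_BANDS sk_buff_head queues placed right behind the
 * struct Qdisc, conceptually:
 *
 *	struct sk_buff_head *band = qdisc_priv(qdisc);	   band[0 .. 2]
 *
 * prio2list() indexes into this array via prio2band[].
 */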

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	/* err already holds a negative errno (-ENOBUFS) */
	return ERR_PTR(err);
}
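
/* Memory layout produced by qdisc_alloc(), for illustration:
 *
 *	p                       sch = QDISC_ALIGN(p)
 *	|<---- sch->padded ---->|<--- struct Qdisc --->|<-- ops->priv_size -->|
 *
 * __qdisc_destroy() uses sch->padded to recover the original allocation
 * pointer before handing it to kfree().
 */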

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				 unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->stats_lock = &dev->queue_lock;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
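
/* Illustrative usage, a sketch of how a classful qdisc of this era (e.g.
 * prio) creates a default child for one of its bands; the "band" variable
 * and pfifo child are assumptions, not code from this file:
 *
 *	child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
 *				  TC_H_MAKE(sch->handle, band + 1));
 *	if (child == NULL)
 *		... keep the previous (or noop) qdisc in place ...
 *
 * A NULL return only means "no default qdisc could be set up"; the errno is
 * not propagated.
 */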

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the RCU callback function used to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device, create a
	   default one, i.e. pfifo_fast for devices which need queueing
	   and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}
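
/* As far as callers go (an assumption about the rest of the stack, not
 * enforced here): dev_activate() runs from dev_open(), and when activation
 * was deferred above because the carrier was down, it is retried from the
 * link watch code once netif_carrier_on() fires.
 */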

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);