/* xref: /openbmc/linux/net/sched/sch_generic.c (revision d90df3ad) */
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * the dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue and dequeue are serialized via the top-level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via the top-level device
 *   spinlock dev->ingress_lock.
 * - updates to the tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */


/* Kick the device.

   Returns:  0  - queue is empty or throttled.
	    >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set, it does its own locking
		 * in start_xmit.  There is no need to add the overhead of
		 * locking again here; these checks are worth it because
		 * even uncongested locks can be quite expensive.  The
		 * driver can do a trylock, just as we do here; if it hits
		 * lock contention it should return NETDEV_TX_LOCKED (-1)
		 * and the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error,
				   when hard_start_xmit() recurses.  We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					goto out;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release the queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					goto out;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* The device kicked us out :(
		   This is possible in four cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 0;
	}

out:
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

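/*
 * Drain the queue: keep calling qdisc_restart() until it reports an
 * empty/throttled queue or the driver stops the tx queue.  The
 * __LINK_STATE_QDISC_RUNNING bit is set by our caller (qdisc_run());
 * clearing it here lets another CPU take over the next run.
 */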
void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	do {
		if (!qdisc_restart(dev))
			break;
	} while (!netif_queue_stopped(dev));

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

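/*
 * Transmit watchdog timer.  While the device is present, running and
 * has carrier, check whether the tx queue has been stopped for longer
 * than dev->watchdog_timeo since the last transmission; if so, log the
 * stall and invoke the driver's tx_timeout() handler.  The timer
 * re-arms itself and keeps a reference on the device while pending.
 */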
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

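/*
 * Arm the transmit watchdog, but only for drivers that provide a
 * tx_timeout() handler.  A non-positive watchdog_timeo falls back to a
 * default of 5 seconds, and a device reference is held for as long as
 * the timer is pending.
 */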
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

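/*
 * netif_carrier_on/netif_carrier_off track link state.  A linkwatch
 * event is fired only on a real transition of __LINK_STATE_NOCARRIER;
 * netif_carrier_on additionally restarts the transmit watchdog when the
 * device is running.
 */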
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


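/*
 * prio2band maps the 16 possible TC_PRIO_* values (skb->priority masked
 * with TC_PRIO_MAX) onto the three pfifo_fast bands; band 0 is served
 * first.  For example, TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7)
 * map to band 0, TC_PRIO_BESTEFFORT (0) to band 1 and TC_PRIO_BULK (2)
 * to band 2.
 */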
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

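/*
 * Dequeue with strict priority between bands: band 1 is only looked at
 * once band 0 is empty, and band 2 only once bands 0 and 1 are empty.
 */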
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

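/*
 * Allocate a qdisc and its private area in a single zeroed block.  The
 * struct Qdisc is placed at a QDISC_ALIGNTO-aligned offset and the
 * padding in front of it is recorded in sch->padded so __qdisc_destroy()
 * can free the original pointer.  A reference on the device is taken
 * here and dropped again in qdisc_destroy().
 */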
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	/* err already holds a negative errno, so pass it straight to ERR_PTR() */
	return ERR_PTR(err);
}

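/*
 * Convenience wrapper around qdisc_alloc(): hook the statistics lock up
 * to the device queue lock, record the parent handle and run the
 * ops->init() callback.  If init fails the half-built qdisc is
 * destroyed; NULL is returned on any failure.
 */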
struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->stats_lock = &dev->queue_lock;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it.
 */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

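/*
 * Switch the device from the noop qdisc to its configured (sleeping)
 * qdisc, installing a default one first if none exists.  When the
 * carrier is down the switch is deferred to the next carrier-on event;
 * otherwise the sleeping qdisc becomes active under dev->queue_lock and
 * the transmit watchdog is started.
 */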
void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device, create a
	   default one: pfifo_fast for devices which need queueing and
	   noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until the next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

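/*
 * Detach the active qdisc: packets now hit noop_qdisc and are dropped,
 * the old qdisc is reset and the watchdog stopped.  We then wait for
 * in-flight dev_queue_xmit() calls (synchronize_rcu) and for any
 * qdisc_run() still executing elsewhere before dropping a deferred GSO
 * skb, if one is pending.
 */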
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

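/*
 * Final teardown: under the qdisc tree locks, point both the active and
 * the sleeping qdisc at noop_qdisc and destroy whatever was installed,
 * including a configured ingress qdisc.  The watchdog timer must no
 * longer be pending by this point.
 */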
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);