/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

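/* Take the two locks that protect the qdisc tree of a device: the
 * egress queue lock and the ingress lock, in that order; the unlock
 * helper below releases them in reverse order.
 */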
void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is taken, the other must be free.
 */


/* Kick device.

   Returns:  0  - queue is empty or throttled.
	    >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can also do a trylock, like we do here; if it
		 * cannot take the lock it should return NETDEV_TX_LOCKED
		 * and the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error,
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit lock owner and drop
				   the packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					goto out;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto out;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					q = dev->qdisc;
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

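		/* Requeue path: drop the packet if only the noop qdisc is
		 * left, stash a segmented (GSO) skb list in dev->gso_skb,
		 * otherwise hand the packet back to the qdisc, then
		 * reschedule the queue.
		 */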
requeue:
		if (unlikely(q == &noop_qdisc))
			kfree_skb(skb);
		else if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
	}
	return 0;

out:
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

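/* Run the transmit queue: keep calling qdisc_restart() until it reports
 * an empty or throttled queue or the driver stops the device, then clear
 * the __LINK_STATE_QDISC_RUNNING bit so another CPU may run the queue.
 */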
void __qdisc_run(struct net_device *dev)
{
	do {
		if (!qdisc_restart(dev))
			break;
	} while (!netif_queue_stopped(dev));

	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

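/* Transmit watchdog timer: if the device queue has been stopped for
 * longer than dev->watchdog_timeo, invoke the driver's tx_timeout()
 * handler and re-arm the timer.
 */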
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

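/* Arm the transmit watchdog if the driver provides a tx_timeout()
 * handler; a watchdog_timeo of zero or less defaults to 5 seconds.
 * A device reference is taken whenever the timer was not already
 * pending.
 */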
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

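/* Mark the device as having carrier, fire a link-watch event on the
 * transition and restart the transmit watchdog if the device is up.
 */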
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

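/* Mark the device as having lost carrier and fire a link-watch event
 * on the transition.
 */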
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

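/* The noop qdisc discards every packet: enqueue frees the skb and
 * signals congestion (NET_XMIT_CN), dequeue never returns anything.
 */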
static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


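/* prio2band[] maps the sixteen TC_PRIO_* values onto the three
 * pfifo_fast bands; band 0 is dequeued first. For example,
 * TC_PRIO_INTERACTIVE (6) maps to band 0, TC_PRIO_BESTEFFORT (0)
 * to band 1 and TC_PRIO_BULK (2) to band 2.
 */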
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

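/* Enqueue at the tail of the band selected by skb->priority, provided
 * that band holds fewer than dev->tx_queue_len packets; otherwise the
 * packet is dropped.
 */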
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

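/* Dequeue from the lowest-numbered (highest-priority) band that is not
 * empty.
 */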
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

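/* Allocate a qdisc plus ops->priv_size bytes of private data, with both
 * aligned to QDISC_ALIGNTO bytes, and take a reference on the device.
 * Returns the new qdisc or an ERR_PTR() value on failure.
 */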
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

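/* Allocate and initialise a default qdisc of the given type for the
 * device; returns NULL if allocation or ops->init() fails.
 */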
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				 unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->stats_lock = &dev->queue_lock;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* RCU callback to clean up a qdisc once there are no further
 * references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

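/* Drop a reference on the qdisc. Once the last reference is gone the
 * rate estimator is killed, ops->reset() and ops->destroy() are called
 * and the memory is released after an RCU grace period. Built-in
 * qdiscs are never destroyed.
 */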
void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   which need queueing and noqueue_qdisc for virtual
	   interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

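/* Detach the active qdisc from the device, free any stashed GSO skb,
 * stop the watchdog and wait until no CPU is still transmitting
 * through this device.
 */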
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;
	struct sk_buff *skb;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	skb = dev->gso_skb;
	dev->gso_skb = NULL;
	spin_unlock_bh(&dev->queue_lock);

	kfree_skb(skb);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

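/* Destroy the qdiscs attached to the device, including the ingress
 * qdisc when configured, while holding the qdisc tree locks.
 */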
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);