xref: /openbmc/linux/net/sched/sch_generic.c (revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2)
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock (qdisc_tree_lock).

   Modifications to data that participates in scheduling must
   additionally be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top-level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made under the rtnl semaphore or
     from softirq context (the __qdisc_destroy rcu-callback),
     hence this lock needs local bh disabling.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
	write_lock_bh(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
	write_unlock_bh(&qdisc_tree_lock);
}
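
/*
 * Usage sketch (illustrative only, not a new helper): a writer that swaps
 * part of the qdisc tree takes both locks, in the required order, through
 * the helpers above, e.g.
 *
 *	qdisc_lock_tree(dev);
 *	dev->qdisc_sleeping = &noop_qdisc;
 *	qdisc_unlock_tree(dev);
 *
 * dev_init_scheduler() and dev_shutdown() at the bottom of this file
 * follow exactly this pattern.
 */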

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to the device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive:
   if one is held, the other must be free.
 */

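/*
 * Condensed sketch of the hand-off this rule forces; it mirrors what
 * qdisc_restart() below actually does (the caller already holds
 * queue_lock):
 *
 *	skb = q->dequeue(q);
 *	if (!spin_trylock(&dev->xmit_lock))	-- trylock only, never block
 *		goto requeue;			-- while queue_lock is held
 *	spin_unlock(&dev->queue_lock);		-- release before transmitting
 *	dev->hard_start_xmit(skb, dev);
 *	spin_unlock(&dev->xmit_lock);
 *	spin_lock(&dev->queue_lock);		-- reacquire before returning
 */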

/* Kick device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty; the device is throttled if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if ((skb = q->dequeue(q)) != NULL) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);
		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add extra overhead by
		 * locking again. These checks are worth it because
		 * even uncontended locks can be quite expensive.
		 * An LLTX driver can use a trylock, as we do here; on
		 * lock contention it should return NETDEV_TX_LOCKED and
		 * the packet will be requeued.
		 */
		if (!nolock) {
			if (!spin_trylock(&dev->xmit_lock)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error,
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
			/* Remember that the driver is grabbed by us. */
			dev->xmit_lock_owner = smp_processor_id();
		}

		{
			/* And release the queue lock */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;
				if (netdev_nit)
					dev_queue_xmit_nit(skb, dev);

				ret = dev->hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						dev->xmit_lock_owner = -1;
						spin_unlock(&dev->xmit_lock);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				dev->xmit_lock_owner = -1;
				spin_unlock(&dev->xmit_lock);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This can happen in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	return q->q.qlen;
}
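
/*
 * How the return value is consumed (sketch): the caller, qdisc_run() in
 * include/net/pkt_sched.h, keeps kicking the device while packets are
 * going out, roughly
 *
 *	while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
 *		;
 *
 * so a negative return means "try again", and a return >= 0 (queue empty,
 * throttled, or packet requeued) stops the loop.
 */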

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	spin_lock(&dev->xmit_lock);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    (jiffies - dev->trans_start) > dev->watchdog_timeo) {
				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	spin_unlock(&dev->xmit_lock);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	__netdev_watchdog_up(dev);
	spin_unlock_bh(&dev->xmit_lock);
}

static void dev_watchdog_down(struct net_device *dev)
{
	spin_lock_bh(&dev->xmit_lock);
	if (del_timer(&dev->watchdog_timer))
		__dev_put(dev);
	spin_unlock_bh(&dev->xmit_lock);
}
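
/*
 * Driver-side view (sketch; the handler name is made up): a driver opts
 * into this watchdog simply by filling in two fields before
 * register_netdev(); dev_watchdog() above then calls the handler whenever
 * the tx queue has been stopped for longer than watchdog_timeo jiffies
 * since trans_start.
 *
 *	dev->tx_timeout = mydrv_tx_timeout;
 *	dev->watchdog_timeo = 2 * HZ;	-- defaults to 5*HZ if left <= 0
 */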

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int
noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *
noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int
noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */
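
/*
 * Worked example of the mapping above (TC_PRIO_* values are from
 * <linux/pkt_sched.h>): skb->priority & TC_PRIO_MAX indexes prio2band,
 * and band 0 is dequeued first, so
 *
 *	TC_PRIO_INTERACTIVE (6)  -> band 0  (served first)
 *	TC_PRIO_BESTEFFORT  (0)  -> band 1
 *	TC_PRIO_BULK        (2)  -> band 2  (served last)
 */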

static int
pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);

	list += prio2band[skb->priority & TC_PRIO_MAX];

	if (list->qlen < qdisc->dev->tx_queue_len) {
		__skb_queue_tail(list, skb);
		qdisc->q.qlen++;
		qdisc->bstats.bytes += skb->len;
		qdisc->bstats.packets++;
		return 0;
	}
	qdisc->qstats.drops++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);
	struct sk_buff *skb;

	for (prio = 0; prio < 3; prio++, list++) {
		skb = __skb_dequeue(list);
		if (skb) {
			qdisc->q.qlen--;
			return skb;
		}
	}
	return NULL;
}

static int
pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);

	list += prio2band[skb->priority & TC_PRIO_MAX];

	__skb_queue_head(list, skb);
	qdisc->q.qlen++;
	qdisc->qstats.requeues++;
	return 0;
}

static void
pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < 3; prio++)
		skb_queue_purge(list + prio);
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	unsigned char *b = skb->tail;
	struct tc_prio_qopt opt;

	opt.bands = 3;
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int i;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (i = 0; i < 3; i++)
		skb_queue_head_init(list + i);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"pfifo_fast",
	.priv_size	=	3 * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	int size;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
	size += ops->priv_size + QDISC_ALIGN_CONST;

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		return NULL;
	memset(p, 0, size);

	sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
			       & ~QDISC_ALIGN_CONST);
	sch->padded = (char *)sch - (char *)p;
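
	/*
	 * Worked example of the alignment dance above, assuming
	 * QDISC_ALIGN_CONST is 31 (i.e. 32-byte alignment, as the comment
	 * on the size calculation says): if kmalloc() returns a pointer
	 * ending in 0xe8, sch is rounded up to the next 32-byte boundary
	 * at 0x100 and padded records the 24-byte offset, so that
	 * __qdisc_destroy() can undo it via
	 * kfree((char *)qdisc - qdisc->padded).
	 */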

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	dev_put(dev);
	kfree(p);
	return NULL;
}
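
/*
 * Usage sketch: dev_activate() below is the canonical caller in this file;
 * any other user follows the same shape (the ops chosen here are just an
 * example):
 *
 *	struct Qdisc *q = qdisc_create_dflt(dev, &pfifo_fast_ops);
 *
 *	if (q == NULL)
 *		goto fail;		-- allocation or ops->init failed
 *	...
 *	qdisc_destroy(q);		-- under dev->queue_lock and BH;
 *					   drops the reference taken above
 */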

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the rcu callback function to clean up a qdisc when there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	struct Qdisc_ops *ops = qdisc->ops;

#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	write_lock(&qdisc_tree_lock);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
	write_unlock(&qdisc_tree_lock);
	module_put(ops->owner);

	dev_put(qdisc->dev);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct list_head cql = LIST_HEAD_INIT(cql);
	struct Qdisc *cq, *q, *n;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	if (!list_empty(&qdisc->list)) {
		if (qdisc->ops->cl_ops == NULL)
			list_del(&qdisc->list);
		else
			list_move(&qdisc->list, &cql);
	}

	/* unlink inner qdiscs from dev->qdisc_list immediately */
	list_for_each_entry(cq, &cql, list)
		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
				if (q->ops->cl_ops == NULL)
					list_del_init(&q->list);
				else
					list_move_tail(&q->list, &cql);
			}
	list_for_each_entry_safe(cq, n, &cql, list)
		list_del_init(&cq->list);

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device,
	   create a default one: pfifo_fast for devices that need
	   queueing, and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock_bh(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock_bh(&qdisc_tree_lock);
		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock_bh(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock_bh(&qdisc_tree_lock);
	}

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	while (test_bit(__LINK_STATE_SCHED, &dev->state))
		yield();

	spin_unlock_wait(&dev->xmit_lock);
}
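
/*
 * Where activate/deactivate fit in (sketch): the core calls these from the
 * generic interface up/down paths in net/core/dev.c, roughly
 *
 *	dev_open(dev)  -> ... -> dev_activate(dev);
 *	dev_close(dev) -> ... -> dev_deactivate(dev);
 *
 * so a device only transmits through a live qdisc; while it is down,
 * dev->qdisc points at noop_qdisc, whose enqueue simply drops packets.
 */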

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);