xref: /openbmc/linux/net/sched/sch_generic.c (revision 0463d4ae)
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * the dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue and dequeue are serialized via the top-level device
 *   spinlock dev->queue_lock.
 * - updates to the tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.
 */
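
/*
 * A minimal sketch of the ordering qdisc_restart() below uses to honour
 * that rule (illustration only, not additional code in this file): the
 * driver lock is only ever try-acquired, and queue_lock is dropped before
 * the driver's hard_start_xmit runs.
 *
 *	spin_lock(&dev->queue_lock);
 *	skb = q->dequeue(q);
 *	if (!netif_tx_trylock(dev))        -- never spins with queue_lock held
 *		goto requeue;
 *	spin_unlock(&dev->queue_lock);     -- dropped across the driver call
 *	ret = dev_hard_start_xmit(skb, dev);
 *	netif_tx_unlock(dev);
 *	spin_lock(&dev->queue_lock);       -- reacquired before returning
 */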


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so the
   stopped state of the queue is re-checked here rather than assumed.

   Returns:  0  - the queue is empty.
	    >0  - the queue is not empty, but nothing could be sent right
		  now: the qdisc is throttled, or the packet had to be
		  requeued (and the device rescheduled).
	    <0  - a packet was dequeued and handed to the driver (or dropped
		  on a detected dead loop); the caller should call again.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if ((skb = dev->gso_skb) || (skb = q->dequeue(q))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit, so there is no need to add overhead by
		 * taking the lock again here. These checks are worth it
		 * because even uncontended locks can be quite expensive.
		 * Such a driver can use a trylock, as we do here; on
		 * lock contention it should return NETDEV_TX_LOCKED and
		 * the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* The device kicked us out :(
		   This is possible in four cases:

		   0. the driver is locked
		   1. fastroute is enabled
		   2. the device cannot determine the busy state
		      before the start of transmission (e.g. dialout)
		   3. the device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
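
/*
 * For context: __qdisc_run() is normally entered through the qdisc_run()
 * inline in include/net/pkt_sched.h, which in this kernel generation looks
 * roughly like the sketch below (quoted here for illustration only):
 *
 *	static inline void qdisc_run(struct net_device *dev)
 *	{
 *		if (!netif_queue_stopped(dev) &&
 *		    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *			__qdisc_run(dev);
 *	}
 *
 * The __LINK_STATE_QDISC_RUNNING bit guarantees a single runner per device;
 * it is cleared above once the queue has been drained, throttled or stopped.
 */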

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}
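
/*
 * A driver arms this machinery by installing a tx_timeout handler (and,
 * optionally, its own timeout) before register_netdev().  A minimal,
 * hypothetical sketch -- the mydrv_* name is not a real symbol:
 *
 *	dev->tx_timeout     = mydrv_tx_timeout;   -- invoked by dev_watchdog()
 *	dev->watchdog_timeo = 2 * HZ;             -- defaults to 5*HZ if <= 0
 *	register_netdev(dev);
 */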

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
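
/*
 * prio2band maps the TC_PRIO_* value kept in skb->priority onto one of
 * the three bands below; band 0 is served first by pfifo_fast_dequeue().
 * For example, with the TC_PRIO_* values from <linux/pkt_sched.h>:
 *	TC_PRIO_BESTEFFORT  (0) -> band 1
 *	TC_PRIO_BULK        (2) -> band 2
 *	TC_PRIO_INTERACTIVE (6) -> band 0
 *	TC_PRIO_CONTROL     (7) -> band 0
 */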

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
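
/*
 * Unlike the schedulers in the other sch_*.c files, pfifo_fast_ops is not
 * passed to register_qdisc(); dev_activate() below attaches it directly as
 * the built-in default.  A loadable scheduler would instead be registered
 * roughly like this (the mysched_* names are hypothetical):
 *
 *	static int __init mysched_module_init(void)
 *	{
 *		return register_qdisc(&mysched_qdisc_ops);
 *	}
 *	module_init(mysched_module_init);
 *
 * register_qdisc() lives in net/sched/sch_api.c.
 */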

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	/* err is already negative (-ENOBUFS) */
	return ERR_PTR(err);
}
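
/*
 * Worked example of the sizing above, assuming QDISC_ALIGNTO is 32 as
 * defined in <net/pkt_sched.h> (the struct sizes here are made up, since
 * the real ones depend on the configuration):
 *
 *	sizeof(struct Qdisc) = 140  ->  QDISC_ALIGN() rounds it up to 160
 *	ops->priv_size       = 48   ->  size = 160 + 48 + 31 = 239 bytes
 *
 * kzalloc() may hand back a pointer that is not 32-byte aligned; rounding
 * it up costs at most the 31 slack bytes and leaves both the Qdisc and the
 * private area (at sch + 160, see qdisc_priv()) on 32-byte boundaries,
 * with sch->padded recording the offset for the final kfree().
 */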
441 
442 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
443 				 unsigned int parentid)
444 {
445 	struct Qdisc *sch;
446 
447 	sch = qdisc_alloc(dev, ops);
448 	if (IS_ERR(sch))
449 		goto errout;
450 	sch->parent = parentid;
451 
452 	if (!ops->init || ops->init(sch, NULL) == 0)
453 		return sch;
454 
455 	qdisc_destroy(sch);
456 errout:
457 	return NULL;
458 }
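
/*
 * Typical use outside this file: a classful scheduler creating a default
 * child fifo for one of its classes, roughly as sch_prio.c does (variable
 * names here are illustrative only):
 *
 *	child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
 *				  TC_H_MAKE(sch->handle, band + 1));
 *	if (child == NULL)
 *		child = &noop_qdisc;
 *
 * A NULL return means that either the allocation or ops->init() failed.
 */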

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* this is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* If no queueing discipline is attached to the device, create a
	   default one: pfifo_fast for devices that need a queue, and
	   noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until the next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);