/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   Modifications to data participating in scheduling must additionally
   be protected with the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(&qdisc_tree_lock)
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence the write lock may be taken without disabling local BH.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */
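
/* A minimal sketch of the hand-off that qdisc_restart() below implements
 * for a non-LLTX driver (error and requeue paths omitted):
 *
 *	caller holds dev->queue_lock
 *	skb = q->dequeue(q);
 *	if (netif_tx_trylock(dev)) {
 *		spin_unlock(&dev->queue_lock);	released before the driver runs
 *		dev_hard_start_xmit(skb, dev);
 *		netif_tx_unlock(dev);
 *		spin_lock(&dev->queue_lock);	reacquired before returning
 *	}
 *
 * dev->queue_lock is dropped before the driver's hard_start_xmit() is
 * invoked and reacquired afterwards, so the driver never runs under it.
 */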

/* Kick the device.
   Note that this procedure can be called by a watchdog timer.

   Returns:  0  - queue is empty.
            >0  - queue is not empty (the qdisc is throttled or the
                  packet was requeued).
            <0  - a packet was handed to the driver or dropped; the
                  caller should try again.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncontended locks can be quite expensive.
		 * The driver can also do a trylock, as we do here; on
		 * lock contention it should return NETDEV_TX_LOCKED
		 * and the packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error,
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in the following cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (e.g. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

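/* For context: callers do not invoke __qdisc_run() directly. The inline
 * qdisc_run() wrapper (in this era's net/pkt_sched.h) guards it with the
 * __LINK_STATE_QDISC_RUNNING bit that is cleared above, roughly:
 *
 *	static inline void qdisc_run(struct net_device *dev)
 *	{
 *		if (!test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *			__qdisc_run(dev);
 *	}
 *
 * so only one CPU runs the qdisc of a given device at a time.
 */
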
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}
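
/* A driver opts in to this watchdog by installing a timeout handler before
 * registering its net_device; the handler name below is illustrative:
 *
 *	dev->tx_timeout = mydrv_tx_timeout;	invoked under netif_tx_lock
 *	dev->watchdog_timeo = 2 * HZ;		<= 0 falls back to 5*HZ above
 *
 * dev_watchdog() then fires the handler if the queue has been stopped for
 * longer than watchdog_timeo jiffies since dev->trans_start was updated.
 */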

void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}
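
/* Typical (sketched) usage from a driver's link-state interrupt or poll
 * routine, where link_is_up stands for whatever link test the hardware
 * provides:
 *
 *	if (link_is_up)
 *		netif_carrier_on(dev);
 *	else
 *		netif_carrier_off(dev);
 *
 * The change is propagated via the linkwatch event, and dev_activate()
 * below delays bringing the queue up until the next carrier-on.
 */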

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};
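
/* noop_qdisc is the built-in qdisc that dev->qdisc points at while a device
 * is not active (see dev_init_scheduler() and dev_deactivate() below), so
 * packets handed to an inactive device are simply freed and reported as
 * congestion (NET_XMIT_CN).
 */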

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3
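
/* To illustrate the prio2band mapping above (band 0 is served first by
 * pfifo_fast_dequeue()), assuming the usual TC_PRIO_* values from
 * <linux/pkt_sched.h>:
 *
 *	prio2band[TC_PRIO_BESTEFFORT]  == 1	default traffic
 *	prio2band[TC_PRIO_BULK]        == 2	lowest-priority band
 *	prio2band[TC_PRIO_INTERACTIVE] == 0	highest-priority band
 *
 * prio2list() below indexes the table with skb->priority & TC_PRIO_MAX.
 */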

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
}
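
/* A worked example of the padding arithmetic above, assuming QDISC_ALIGNTO
 * is 32 (matching the 32-byte alignment noted in the comment) and kzalloc()
 * returns a pointer p whose low byte is 0xe8 (232):
 *
 *	sch    = QDISC_ALIGN(p)          rounds up to the next multiple of 32,
 *	                                 i.e. low byte 0x00 (256)
 *	padded = (char *)sch - (char *)p = 24
 *
 * The extra (QDISC_ALIGNTO - 1) bytes requested above guarantee the aligned
 * Qdisc plus priv_size still fit, and __qdisc_destroy() later frees the
 * original allocation by subtracting qdisc->padded again.
 */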

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				 unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   which need queueing and noqueue_qdisc for virtual
	   interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock(&qdisc_tree_lock);
		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock(&qdisc_tree_lock);
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);