/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Main qdisc structure lock.

   However, modifications to data participating in scheduling
   must additionally be protected by the dev->queue_lock spinlock.

   The idea is the following:
   - enqueue and dequeue are serialized via the top level device
     spinlock dev->queue_lock.
   - tree walking is protected by read_lock(qdisc_tree_lock),
     and this lock is used only in process context.
   - updates to the tree are made only under the rtnl semaphore,
     hence the write lock may be taken without disabling local BHs.

   qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
 */
DEFINE_RWLOCK(qdisc_tree_lock);

void qdisc_lock_tree(struct net_device *dev)
{
	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);
}
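
/* A minimal usage sketch (not code from this file; the function below
 * is hypothetical): a configuration path that edits the tree takes
 * both locks in the documented order, tree lock first.
 *
 *	static void example_graft(struct net_device *dev, struct Qdisc *new)
 *	{
 *		qdisc_lock_tree(dev);		// qdisc_tree_lock, then queue_lock
 *		dev->qdisc_sleeping = new;	// edit the tree
 *		qdisc_unlock_tree(dev);		// release in reverse order
 *	}
 */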

/*
   dev->queue_lock serializes queue accesses for this device
   AND the dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is held, the other must be free.
 */


/* Kick the device.
   Note that this procedure can be called by a watchdog timer, so
   we do not check the dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
	if ((skb = dev->gso_skb) || (skb = q->dequeue(q))) {
		unsigned nolock = (dev->features & NETIF_F_LLTX);

		dev->gso_skb = NULL;

		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can do a trylock, as we do here; on lock
		 * contention it should return NETDEV_TX_LOCKED and the
		 * packet will be requeued.
		 */
		if (!nolock) {
			if (!netif_tx_trylock(dev)) {
			collision:
				/* So, someone grabbed the driver. */

				/* It may be a transient configuration error,
				   when hard_start_xmit() recurses. We detect
				   it by checking the xmit owner and drop the
				   packet when a dead loop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}

		{
			/* And release the queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

				ret = dev_hard_start_xmit(skb, dev);
				if (ret == NETDEV_TX_OK) {
					if (!nolock) {
						netif_tx_unlock(dev);
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
					goto collision;
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
			if (!nolock) {
				netif_tx_unlock(dev);
			}
			spin_lock(&dev->queue_lock);
			/* The qdisc may have been swapped while the
			   queue_lock was dropped; reload it. */
			q = dev->qdisc;
		}

		/* The device kicked us out :(
		   This is possible in the following cases:

		   0. the driver is locked
		   1. fastroute is enabled
		   2. the device cannot determine its busy state
		      before the start of transmission (e.g. dialout)
		   3. the device is buggy (ppp)
		 */

requeue:
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
		netif_schedule(dev);
		return 1;
	}
	BUG_ON((int) q->q.qlen < 0);
	return q->q.qlen;
}

void __qdisc_run(struct net_device *dev)
{
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

out:
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
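
/* For reference, a sketch of the qdisc_run() wrapper as it looks in
 * this era's include/net/pkt_sched.h (reproduced from memory, so treat
 * it as an approximation): the QDISC_RUNNING bit ensures that only one
 * CPU runs __qdisc_run() for a given device at a time, which is why
 * dev_deactivate() below can simply wait for the bit to clear.
 *
 *	static inline void qdisc_run(struct net_device *dev)
 *	{
 *		if (!test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *			__qdisc_run(dev);
 *	}
 */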

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			/* mod_timer() returns 0 if the timer was inactive:
			   take an extra reference for the rearmed timer. */
			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);
	dev->watchdog_timer.data = (unsigned long)dev;
	dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
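
/* A worked example of the mapping above (indices are the TC_PRIO_*
 * values from include/linux/pkt_sched.h): TC_PRIO_BESTEFFORT (0) maps
 * to band 1, TC_PRIO_BULK (2) to the slow band 2, and both
 * TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) to the fast band 0.
 * Lower-numbered bands are drained first; see pfifo_fast_dequeue().
 */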

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
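
/* What the dump above looks like from userspace (sample output only;
 * exact formatting varies with the iproute2 version), connecting the
 * priomap array to the familiar tc(8) display:
 *
 *	$ tc qdisc show dev eth0
 *	qdisc pfifo_fast 0: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 */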

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	/* err already holds a negative errno; ERR_PTR(-err) would encode
	   a positive value that IS_ERR() cannot recognize. */
	return ERR_PTR(err);
}
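
/* A worked example of the alignment arithmetic above, assuming
 * QDISC_ALIGNTO is 32 (as the comment in qdisc_alloc() states): if
 * kzalloc() returns p = 0x...1010, QDISC_ALIGN() rounds it up to
 * sch = 0x...1020 and sch->padded = 0x10. The (QDISC_ALIGNTO - 1)
 * slack added to `size` guarantees that the private area following
 * the rounded-up Qdisc still fits inside the allocation.
 */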

struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}

/* This is the RCU callback that cleans up a qdisc once there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   which need queueing and noqueue_qdisc for virtual
	   interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			write_lock(&qdisc_tree_lock);
			list_add_tail(&qdisc->list, &dev->qdisc_list);
			write_unlock(&qdisc_tree_lock);
		} else {
			qdisc = &noqueue_qdisc;
		}
		write_lock(&qdisc_tree_lock);
		dev->qdisc_sleeping = qdisc;
		write_unlock(&qdisc_tree_lock);
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until the next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
}
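
/* How these pair up in practice (a sketch of the net/core/dev.c call
 * chain in this era, from memory -- details may differ):
 *
 *	dev_open(dev)	-> ... -> dev_activate(dev);
 *	dev_close(dev)	-> dev_deactivate(dev) -> ...;
 *
 * Both run under the RTNL semaphore, which serializes activation
 * against deactivation for a given device.
 */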

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(__netdev_watchdog_up);
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);