xref: /openbmc/linux/net/sched/sch_generic.c (revision 0da974f4)
1 /*
2  * net/sched/sch_generic.c	Generic packet scheduler routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13 
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36 
37 /* Main transmission queue. */
38 
39 /* Main qdisc structure lock.
40 
41    However, modifications
42    to data, participating in scheduling must be additionally
43    protected with dev->queue_lock spinlock.
44 
45    The idea is the following:
46    - enqueue, dequeue are serialized via top level device
47      spinlock dev->queue_lock.
48    - tree walking is protected by read_lock_bh(qdisc_tree_lock)
49      and this lock is used only in process context.
50    - updates to tree are made under rtnl semaphore or
51      from softirq context (__qdisc_destroy rcu-callback)
52      hence this lock needs local bh disabling.
53 
54    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
55  */
56 DEFINE_RWLOCK(qdisc_tree_lock);
57 
58 void qdisc_lock_tree(struct net_device *dev)
59 {
60 	write_lock_bh(&qdisc_tree_lock);
61 	spin_lock_bh(&dev->queue_lock);
62 }
63 
64 void qdisc_unlock_tree(struct net_device *dev)
65 {
66 	spin_unlock_bh(&dev->queue_lock);
67 	write_unlock_bh(&qdisc_tree_lock);
68 }
69 
70 /*
71    dev->queue_lock serializes queue accesses for this device
72    AND dev->qdisc pointer itself.
73 
74    netif_tx_lock serializes accesses to device driver.
75 
76    dev->queue_lock and netif_tx_lock are mutually exclusive,
77    if one is grabbed, another must be free.
78  */
79 
80 
81 /* Kick device.
82    Note, that this procedure can be called by a watchdog timer, so that
83    we do not check dev->tbusy flag here.
84 
85    Returns:  0  - queue is empty.
86             >0  - queue is not empty, but throttled.
87 	    <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.
88 
89    NOTE: Called under dev->queue_lock with locally disabled BH.
90 */
91 
92 static inline int qdisc_restart(struct net_device *dev)
93 {
94 	struct Qdisc *q = dev->qdisc;
95 	struct sk_buff *skb;
96 
97 	/* Dequeue packet */
98 	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
99 		unsigned nolock = (dev->features & NETIF_F_LLTX);
100 
101 		dev->gso_skb = NULL;
102 
103 		/*
104 		 * When the driver has LLTX set it does its own locking
105 		 * in start_xmit. No need to add additional overhead by
106 		 * locking again. These checks are worth it because
107 		 * even uncongested locks can be quite expensive.
108 		 * The driver can do trylock like here too, in case
109 		 * of lock congestion it should return -1 and the packet
110 		 * will be requeued.
111 		 */
112 		if (!nolock) {
113 			if (!netif_tx_trylock(dev)) {
114 			collision:
115 				/* So, someone grabbed the driver. */
116 
117 				/* It may be transient configuration error,
118 				   when hard_start_xmit() recurses. We detect
119 				   it by checking xmit owner and drop the
120 				   packet when deadloop is detected.
121 				*/
122 				if (dev->xmit_lock_owner == smp_processor_id()) {
123 					kfree_skb(skb);
124 					if (net_ratelimit())
125 						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
126 					return -1;
127 				}
128 				__get_cpu_var(netdev_rx_stat).cpu_collision++;
129 				goto requeue;
130 			}
131 		}
132 
133 		{
134 			/* And release queue */
135 			spin_unlock(&dev->queue_lock);
136 
137 			if (!netif_queue_stopped(dev)) {
138 				int ret;
139 
140 				ret = dev_hard_start_xmit(skb, dev);
141 				if (ret == NETDEV_TX_OK) {
142 					if (!nolock) {
143 						netif_tx_unlock(dev);
144 					}
145 					spin_lock(&dev->queue_lock);
146 					return -1;
147 				}
148 				if (ret == NETDEV_TX_LOCKED && nolock) {
149 					spin_lock(&dev->queue_lock);
150 					goto collision;
151 				}
152 			}
153 
154 			/* NETDEV_TX_BUSY - we need to requeue */
155 			/* Release the driver */
156 			if (!nolock) {
157 				netif_tx_unlock(dev);
158 			}
159 			spin_lock(&dev->queue_lock);
160 			q = dev->qdisc;
161 		}
162 
163 		/* Device kicked us out :(
164 		   This is possible in three cases:
165 
166 		   0. driver is locked
167 		   1. fastroute is enabled
168 		   2. device cannot determine busy state
169 		      before start of transmission (f.e. dialout)
170 		   3. device is buggy (ppp)
171 		 */
172 
173 requeue:
174 		if (skb->next)
175 			dev->gso_skb = skb;
176 		else
177 			q->ops->requeue(skb, q);
178 		netif_schedule(dev);
179 		return 1;
180 	}
181 	BUG_ON((int) q->q.qlen < 0);
182 	return q->q.qlen;
183 }
184 
185 void __qdisc_run(struct net_device *dev)
186 {
187 	if (unlikely(dev->qdisc == &noop_qdisc))
188 		goto out;
189 
190 	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
191 		/* NOTHING */;
192 
193 out:
194 	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
195 }
196 
197 static void dev_watchdog(unsigned long arg)
198 {
199 	struct net_device *dev = (struct net_device *)arg;
200 
201 	netif_tx_lock(dev);
202 	if (dev->qdisc != &noop_qdisc) {
203 		if (netif_device_present(dev) &&
204 		    netif_running(dev) &&
205 		    netif_carrier_ok(dev)) {
206 			if (netif_queue_stopped(dev) &&
207 			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
208 
209 				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
210 				       dev->name);
211 				dev->tx_timeout(dev);
212 			}
213 			if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
214 				dev_hold(dev);
215 		}
216 	}
217 	netif_tx_unlock(dev);
218 
219 	dev_put(dev);
220 }
221 
222 static void dev_watchdog_init(struct net_device *dev)
223 {
224 	init_timer(&dev->watchdog_timer);
225 	dev->watchdog_timer.data = (unsigned long)dev;
226 	dev->watchdog_timer.function = dev_watchdog;
227 }
228 
229 void __netdev_watchdog_up(struct net_device *dev)
230 {
231 	if (dev->tx_timeout) {
232 		if (dev->watchdog_timeo <= 0)
233 			dev->watchdog_timeo = 5*HZ;
234 		if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
235 			dev_hold(dev);
236 	}
237 }
238 
239 static void dev_watchdog_up(struct net_device *dev)
240 {
241 	netif_tx_lock_bh(dev);
242 	__netdev_watchdog_up(dev);
243 	netif_tx_unlock_bh(dev);
244 }
245 
246 static void dev_watchdog_down(struct net_device *dev)
247 {
248 	netif_tx_lock_bh(dev);
249 	if (del_timer(&dev->watchdog_timer))
250 		dev_put(dev);
251 	netif_tx_unlock_bh(dev);
252 }
253 
254 void netif_carrier_on(struct net_device *dev)
255 {
256 	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
257 		linkwatch_fire_event(dev);
258 	if (netif_running(dev))
259 		__netdev_watchdog_up(dev);
260 }
261 
262 void netif_carrier_off(struct net_device *dev)
263 {
264 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
265 		linkwatch_fire_event(dev);
266 }
267 
268 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
269    under all circumstances. It is difficult to invent anything faster or
270    cheaper.
271  */
272 
273 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
274 {
275 	kfree_skb(skb);
276 	return NET_XMIT_CN;
277 }
278 
279 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
280 {
281 	return NULL;
282 }
283 
284 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
285 {
286 	if (net_ratelimit())
287 		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
288 		       skb->dev->name);
289 	kfree_skb(skb);
290 	return NET_XMIT_CN;
291 }
292 
293 struct Qdisc_ops noop_qdisc_ops = {
294 	.id		=	"noop",
295 	.priv_size	=	0,
296 	.enqueue	=	noop_enqueue,
297 	.dequeue	=	noop_dequeue,
298 	.requeue	=	noop_requeue,
299 	.owner		=	THIS_MODULE,
300 };
301 
302 struct Qdisc noop_qdisc = {
303 	.enqueue	=	noop_enqueue,
304 	.dequeue	=	noop_dequeue,
305 	.flags		=	TCQ_F_BUILTIN,
306 	.ops		=	&noop_qdisc_ops,
307 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
308 };
309 
310 static struct Qdisc_ops noqueue_qdisc_ops = {
311 	.id		=	"noqueue",
312 	.priv_size	=	0,
313 	.enqueue	=	noop_enqueue,
314 	.dequeue	=	noop_dequeue,
315 	.requeue	=	noop_requeue,
316 	.owner		=	THIS_MODULE,
317 };
318 
319 static struct Qdisc noqueue_qdisc = {
320 	.enqueue	=	NULL,
321 	.dequeue	=	noop_dequeue,
322 	.flags		=	TCQ_F_BUILTIN,
323 	.ops		=	&noqueue_qdisc_ops,
324 	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
325 };
326 
327 
328 static const u8 prio2band[TC_PRIO_MAX+1] =
329 	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
330 
331 /* 3-band FIFO queue: old style, but should be a bit faster than
332    generic prio+fifo combination.
333  */
334 
335 #define PFIFO_FAST_BANDS 3
336 
337 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
338 					     struct Qdisc *qdisc)
339 {
340 	struct sk_buff_head *list = qdisc_priv(qdisc);
341 	return list + prio2band[skb->priority & TC_PRIO_MAX];
342 }
343 
344 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
345 {
346 	struct sk_buff_head *list = prio2list(skb, qdisc);
347 
348 	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
349 		qdisc->q.qlen++;
350 		return __qdisc_enqueue_tail(skb, qdisc, list);
351 	}
352 
353 	return qdisc_drop(skb, qdisc);
354 }
355 
356 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
357 {
358 	int prio;
359 	struct sk_buff_head *list = qdisc_priv(qdisc);
360 
361 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
362 		if (!skb_queue_empty(list + prio)) {
363 			qdisc->q.qlen--;
364 			return __qdisc_dequeue_head(qdisc, list + prio);
365 		}
366 	}
367 
368 	return NULL;
369 }
370 
371 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
372 {
373 	qdisc->q.qlen++;
374 	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
375 }
376 
377 static void pfifo_fast_reset(struct Qdisc* qdisc)
378 {
379 	int prio;
380 	struct sk_buff_head *list = qdisc_priv(qdisc);
381 
382 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
383 		__qdisc_reset_queue(qdisc, list + prio);
384 
385 	qdisc->qstats.backlog = 0;
386 	qdisc->q.qlen = 0;
387 }
388 
389 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
390 {
391 	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
392 
393 	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
394 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
395 	return skb->len;
396 
397 rtattr_failure:
398 	return -1;
399 }
400 
401 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
402 {
403 	int prio;
404 	struct sk_buff_head *list = qdisc_priv(qdisc);
405 
406 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
407 		skb_queue_head_init(list + prio);
408 
409 	return 0;
410 }
411 
412 static struct Qdisc_ops pfifo_fast_ops = {
413 	.id		=	"pfifo_fast",
414 	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
415 	.enqueue	=	pfifo_fast_enqueue,
416 	.dequeue	=	pfifo_fast_dequeue,
417 	.requeue	=	pfifo_fast_requeue,
418 	.init		=	pfifo_fast_init,
419 	.reset		=	pfifo_fast_reset,
420 	.dump		=	pfifo_fast_dump,
421 	.owner		=	THIS_MODULE,
422 };
423 
424 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
425 {
426 	void *p;
427 	struct Qdisc *sch;
428 	unsigned int size;
429 	int err = -ENOBUFS;
430 
431 	/* ensure that the Qdisc and the private data are 32-byte aligned */
432 	size = QDISC_ALIGN(sizeof(*sch));
433 	size += ops->priv_size + (QDISC_ALIGNTO - 1);
434 
435 	p = kzalloc(size, GFP_KERNEL);
436 	if (!p)
437 		goto errout;
438 	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
439 	sch->padded = (char *) sch - (char *) p;
440 
441 	INIT_LIST_HEAD(&sch->list);
442 	skb_queue_head_init(&sch->q);
443 	sch->ops = ops;
444 	sch->enqueue = ops->enqueue;
445 	sch->dequeue = ops->dequeue;
446 	sch->dev = dev;
447 	dev_hold(dev);
448 	sch->stats_lock = &dev->queue_lock;
449 	atomic_set(&sch->refcnt, 1);
450 
451 	return sch;
452 errout:
453 	return ERR_PTR(-err);
454 }
455 
456 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
457 {
458 	struct Qdisc *sch;
459 
460 	sch = qdisc_alloc(dev, ops);
461 	if (IS_ERR(sch))
462 		goto errout;
463 
464 	if (!ops->init || ops->init(sch, NULL) == 0)
465 		return sch;
466 
467 	qdisc_destroy(sch);
468 errout:
469 	return NULL;
470 }
471 
472 /* Under dev->queue_lock and BH! */
473 
474 void qdisc_reset(struct Qdisc *qdisc)
475 {
476 	struct Qdisc_ops *ops = qdisc->ops;
477 
478 	if (ops->reset)
479 		ops->reset(qdisc);
480 }
481 
482 /* this is the rcu callback function to clean up a qdisc when there
483  * are no further references to it */
484 
485 static void __qdisc_destroy(struct rcu_head *head)
486 {
487 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
488 	struct Qdisc_ops  *ops = qdisc->ops;
489 
490 #ifdef CONFIG_NET_ESTIMATOR
491 	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
492 #endif
493 	write_lock(&qdisc_tree_lock);
494 	if (ops->reset)
495 		ops->reset(qdisc);
496 	if (ops->destroy)
497 		ops->destroy(qdisc);
498 	write_unlock(&qdisc_tree_lock);
499 	module_put(ops->owner);
500 
501 	dev_put(qdisc->dev);
502 	kfree((char *) qdisc - qdisc->padded);
503 }
504 
505 /* Under dev->queue_lock and BH! */
506 
507 void qdisc_destroy(struct Qdisc *qdisc)
508 {
509 	struct list_head cql = LIST_HEAD_INIT(cql);
510 	struct Qdisc *cq, *q, *n;
511 
512 	if (qdisc->flags & TCQ_F_BUILTIN ||
513 		!atomic_dec_and_test(&qdisc->refcnt))
514 		return;
515 
516 	if (!list_empty(&qdisc->list)) {
517 		if (qdisc->ops->cl_ops == NULL)
518 			list_del(&qdisc->list);
519 		else
520 			list_move(&qdisc->list, &cql);
521 	}
522 
523 	/* unlink inner qdiscs from dev->qdisc_list immediately */
524 	list_for_each_entry(cq, &cql, list)
525 		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
526 			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
527 				if (q->ops->cl_ops == NULL)
528 					list_del_init(&q->list);
529 				else
530 					list_move_tail(&q->list, &cql);
531 			}
532 	list_for_each_entry_safe(cq, n, &cql, list)
533 		list_del_init(&cq->list);
534 
535 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
536 }
537 
538 void dev_activate(struct net_device *dev)
539 {
540 	/* No queueing discipline is attached to device;
541 	   create default one i.e. pfifo_fast for devices,
542 	   which need queueing and noqueue_qdisc for
543 	   virtual interfaces
544 	 */
545 
546 	if (dev->qdisc_sleeping == &noop_qdisc) {
547 		struct Qdisc *qdisc;
548 		if (dev->tx_queue_len) {
549 			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);
550 			if (qdisc == NULL) {
551 				printk(KERN_INFO "%s: activation failed\n", dev->name);
552 				return;
553 			}
554 			write_lock_bh(&qdisc_tree_lock);
555 			list_add_tail(&qdisc->list, &dev->qdisc_list);
556 			write_unlock_bh(&qdisc_tree_lock);
557 		} else {
558 			qdisc =  &noqueue_qdisc;
559 		}
560 		write_lock_bh(&qdisc_tree_lock);
561 		dev->qdisc_sleeping = qdisc;
562 		write_unlock_bh(&qdisc_tree_lock);
563 	}
564 
565 	if (!netif_carrier_ok(dev))
566 		/* Delay activation until next carrier-on event */
567 		return;
568 
569 	spin_lock_bh(&dev->queue_lock);
570 	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
571 	if (dev->qdisc != &noqueue_qdisc) {
572 		dev->trans_start = jiffies;
573 		dev_watchdog_up(dev);
574 	}
575 	spin_unlock_bh(&dev->queue_lock);
576 }
577 
578 void dev_deactivate(struct net_device *dev)
579 {
580 	struct Qdisc *qdisc;
581 
582 	spin_lock_bh(&dev->queue_lock);
583 	qdisc = dev->qdisc;
584 	dev->qdisc = &noop_qdisc;
585 
586 	qdisc_reset(qdisc);
587 
588 	spin_unlock_bh(&dev->queue_lock);
589 
590 	dev_watchdog_down(dev);
591 
592 	/* Wait for outstanding dev_queue_xmit calls. */
593 	synchronize_rcu();
594 
595 	/* Wait for outstanding qdisc_run calls. */
596 	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
597 		yield();
598 
599 	if (dev->gso_skb) {
600 		kfree_skb(dev->gso_skb);
601 		dev->gso_skb = NULL;
602 	}
603 }
604 
605 void dev_init_scheduler(struct net_device *dev)
606 {
607 	qdisc_lock_tree(dev);
608 	dev->qdisc = &noop_qdisc;
609 	dev->qdisc_sleeping = &noop_qdisc;
610 	INIT_LIST_HEAD(&dev->qdisc_list);
611 	qdisc_unlock_tree(dev);
612 
613 	dev_watchdog_init(dev);
614 }
615 
616 void dev_shutdown(struct net_device *dev)
617 {
618 	struct Qdisc *qdisc;
619 
620 	qdisc_lock_tree(dev);
621 	qdisc = dev->qdisc_sleeping;
622 	dev->qdisc = &noop_qdisc;
623 	dev->qdisc_sleeping = &noop_qdisc;
624 	qdisc_destroy(qdisc);
625 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
626         if ((qdisc = dev->qdisc_ingress) != NULL) {
627 		dev->qdisc_ingress = NULL;
628 		qdisc_destroy(qdisc);
629         }
630 #endif
631 	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
632 	qdisc_unlock_tree(dev);
633 }
634 
635 EXPORT_SYMBOL(__netdev_watchdog_up);
636 EXPORT_SYMBOL(netif_carrier_on);
637 EXPORT_SYMBOL(netif_carrier_off);
638 EXPORT_SYMBOL(noop_qdisc);
639 EXPORT_SYMBOL(noop_qdisc_ops);
640 EXPORT_SYMBOL(qdisc_create_dflt);
641 EXPORT_SYMBOL(qdisc_alloc);
642 EXPORT_SYMBOL(qdisc_destroy);
643 EXPORT_SYMBOL(qdisc_reset);
644 EXPORT_SYMBOL(qdisc_lock_tree);
645 EXPORT_SYMBOL(qdisc_unlock_tree);
646