xref: /openbmc/linux/net/sched/sch_generic.c (revision c716a81a)
1 /*
2  * net/sched/sch_generic.c	Generic packet scheduler routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
11  *              - Ingress support
12  */
13 
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/module.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/netdevice.h>
29 #include <linux/skbuff.h>
30 #include <linux/rtnetlink.h>
31 #include <linux/init.h>
32 #include <linux/rcupdate.h>
33 #include <linux/list.h>
34 #include <net/sock.h>
35 #include <net/pkt_sched.h>
36 
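/* Internal return codes for handle_dev_cpu_collision() below:
 * SCHED_TX_DROP  - this CPU already holds the tx lock (dead loop); drop the skb.
 * SCHED_TX_QUEUE - another CPU holds the tx lock; requeue the skb and retry later.
 */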
37 #define SCHED_TX_DROP -2
38 #define SCHED_TX_QUEUE -3
39 
40 /* Main transmission queue. */
41 
42 /* Modifications to data participating in scheduling must be protected
43  * with the dev->queue_lock spinlock.
44  *
45  * The idea is the following:
46  * - enqueue, dequeue are serialized via top level device
47  *   spinlock dev->queue_lock.
48  * - ingress filtering is serialized via top level device
49  *   spinlock dev->ingress_lock.
50  * - updates to tree and tree walking are only done under the rtnl mutex.
51  */
52 
53 void qdisc_lock_tree(struct net_device *dev)
54 {
55 	spin_lock_bh(&dev->queue_lock);
56 	spin_lock(&dev->ingress_lock);
57 }
58 
59 void qdisc_unlock_tree(struct net_device *dev)
60 {
61 	spin_unlock(&dev->ingress_lock);
62 	spin_unlock_bh(&dev->queue_lock);
63 }
64 
65 static inline int qdisc_qlen(struct Qdisc *q)
66 {
67 	BUG_ON((int) q->q.qlen < 0);
68 	return q->q.qlen;
69 }
70 
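/* Called when netif_tx_trylock() failed or the driver returned
 * NETDEV_TX_LOCKED.  If the lock owner is this CPU we are recursing
 * into the transmit path (a buggy setup); otherwise another CPU is
 * simply transmitting on the same device and we count a collision.
 */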
71 static inline int handle_dev_cpu_collision(struct net_device *dev)
72 {
73 	if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
74 		if (net_ratelimit())
75 			printk(KERN_WARNING
76 			       "Dead loop on netdevice %s, fix it urgently!\n",
77 			       dev->name);
78 		return SCHED_TX_DROP;
79 	}
80 	__get_cpu_var(netdev_rx_stat).cpu_collision++;
81 	return SCHED_TX_QUEUE;
82 }
83 
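/* Put a packet we could not send back for a later retry.  A packet with
 * a segment chain (skb->next set, e.g. a partially transmitted GSO skb)
 * is stashed in dev->gso_skb so it is picked up first next time;
 * anything else goes back to the qdisc via its ->requeue() method.
 */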
84 static inline int
85 do_dev_requeue(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
86 {
87 
88 	if (unlikely(skb->next))
89 		dev->gso_skb = skb;
90 	else
91 		q->ops->requeue(skb, q);
92 	/* XXX: Could netif_schedule fail? Or does the fact that we are
93 	 * requeueing imply that the hardware path is closed, so that
94 	 * even if we fail, some interrupt will wake us?
95 	 */
96 	netif_schedule(dev);
97 	return 0;
98 }
99 
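/* Fetch the next packet to transmit: a deferred GSO skb, if one is
 * pending, takes precedence over the qdisc's own queue.
 */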
100 static inline struct sk_buff *
101 try_get_tx_pkt(struct net_device *dev, struct Qdisc *q)
102 {
103 	struct sk_buff *skb = dev->gso_skb;
104 
105 	if (skb)
106 		dev->gso_skb = NULL;
107 	else
108 		skb = q->dequeue(q);
109 
110 	return skb;
111 }
112 
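/* We could not take the driver's tx lock.  Either drop the packet
 * (same-CPU recursion) or requeue it and let a later run of the
 * queue pick it up once the lock is free.
 */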
113 static inline int
114 tx_islocked(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
115 {
116 	int ret = handle_dev_cpu_collision(dev);
117 
118 	if (ret == SCHED_TX_DROP) {
119 		kfree_skb(skb);
120 		return qdisc_qlen(q);
121 	}
122 
123 	return do_dev_requeue(skb, dev, q);
124 }
125 
126 
127 /*
128    NOTE: Called under dev->queue_lock with locally disabled BH.
129 
130    __LINK_STATE_QDISC_RUNNING guarantees only one CPU
131    can enter this region at a time.
132 
133    dev->queue_lock serializes queue accesses for this device
134    AND dev->qdisc pointer itself.
135 
136    netif_tx_lock serializes accesses to device driver.
137 
138    dev->queue_lock and netif_tx_lock are mutually exclusive:
139    if one is held, the other must be free.
140 
141    Multiple CPUs may contend for the two locks.
142 
143    Note that this procedure can be called by a watchdog timer.
144 
145    Returns to the caller:
146     0  - queue is empty or throttled.
147    >0  - queue is not empty.
148 
149 */
150 
151 static inline int qdisc_restart(struct net_device *dev)
152 {
153 	struct Qdisc *q = dev->qdisc;
154 	unsigned lockless = (dev->features & NETIF_F_LLTX);
155 	struct sk_buff *skb;
156 	int ret;
157 
158 	skb = try_get_tx_pkt(dev, q);
159 	if (skb == NULL)
160 		return 0;
161 
162 	/* we have a packet to send */
163 	if (!lockless) {
164 		if (!netif_tx_trylock(dev))
165 			return tx_islocked(skb, dev, q);
166 	}
167 	/* all clear .. */
168 	spin_unlock(&dev->queue_lock);
169 
170 	ret = NETDEV_TX_BUSY;
171 	if (!netif_queue_stopped(dev))
172 		/* churn baby churn .. */
173 		ret = dev_hard_start_xmit(skb, dev);
174 
175 	if (!lockless)
176 		netif_tx_unlock(dev);
177 
178 	spin_lock(&dev->queue_lock);
179 
180 	/* We need to refresh q because dev->qdisc may have been replaced
181 	 * while we had dropped dev->queue_lock above,
182 	 * so don't try to be clever, grasshopper.
183 	 */
184 	q = dev->qdisc;
185 	/* most likely result, packet went ok */
186 	if (ret == NETDEV_TX_OK)
187 		return qdisc_qlen(q);
188 	/* only for lockless drivers .. */
189 	if (ret == NETDEV_TX_LOCKED && lockless)
190 		return tx_islocked(skb, dev, q);
191 
192 	if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
193 		printk(KERN_WARNING " BUG %s code %d qlen %d\n",dev->name, ret, q->q.qlen);
194 
195 	return do_dev_requeue(skb, dev, q);
196 }
197 
198 
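/* Keep pulling packets off the queue until it is empty or the driver
 * stops the device, then clear __LINK_STATE_QDISC_RUNNING so another
 * CPU may enter qdisc_run() for this device.
 */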
199 void __qdisc_run(struct net_device *dev)
200 {
201 	do {
202 		if (!qdisc_restart(dev))
203 			break;
204 	} while (!netif_queue_stopped(dev));
205 
206 	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
207 }
208 
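/* Transmit watchdog timer.  If the device is present, running and has
 * carrier, but its queue has been stopped for longer than
 * dev->watchdog_timeo since the last transmission, invoke the driver's
 * ->tx_timeout() handler; then re-arm the timer.
 */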
209 static void dev_watchdog(unsigned long arg)
210 {
211 	struct net_device *dev = (struct net_device *)arg;
212 
213 	netif_tx_lock(dev);
214 	if (dev->qdisc != &noop_qdisc) {
215 		if (netif_device_present(dev) &&
216 		    netif_running(dev) &&
217 		    netif_carrier_ok(dev)) {
218 			if (netif_queue_stopped(dev) &&
219 			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
220 
221 				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
222 				       dev->name);
223 				dev->tx_timeout(dev);
224 			}
225 			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
226 				dev_hold(dev);
227 		}
228 	}
229 	netif_tx_unlock(dev);
230 
231 	dev_put(dev);
232 }
233 
234 static void dev_watchdog_init(struct net_device *dev)
235 {
236 	init_timer(&dev->watchdog_timer);
237 	dev->watchdog_timer.data = (unsigned long)dev;
238 	dev->watchdog_timer.function = dev_watchdog;
239 }
240 
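/* Arm the watchdog timer for drivers that implement ->tx_timeout(),
 * defaulting the timeout to 5 seconds.  mod_timer() returning 0 means
 * the timer was not already pending, so take a device reference that
 * is dropped when the timer fires or is deleted.
 */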
241 void __netdev_watchdog_up(struct net_device *dev)
242 {
243 	if (dev->tx_timeout) {
244 		if (dev->watchdog_timeo <= 0)
245 			dev->watchdog_timeo = 5*HZ;
246 		if (!mod_timer(&dev->watchdog_timer,
247 			       round_jiffies(jiffies + dev->watchdog_timeo)))
248 			dev_hold(dev);
249 	}
250 }
251 
252 static void dev_watchdog_up(struct net_device *dev)
253 {
254 	__netdev_watchdog_up(dev);
255 }
256 
257 static void dev_watchdog_down(struct net_device *dev)
258 {
259 	netif_tx_lock_bh(dev);
260 	if (del_timer(&dev->watchdog_timer))
261 		dev_put(dev);
262 	netif_tx_unlock_bh(dev);
263 }
264 
265 void netif_carrier_on(struct net_device *dev)
266 {
267 	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
268 		linkwatch_fire_event(dev);
269 	if (netif_running(dev))
270 		__netdev_watchdog_up(dev);
271 }
272 
273 void netif_carrier_off(struct net_device *dev)
274 {
275 	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
276 		linkwatch_fire_event(dev);
277 }
278 
279 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
280    under all circumstances. It is difficult to invent anything faster or
281    cheaper.
282  */
283 
284 static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
285 {
286 	kfree_skb(skb);
287 	return NET_XMIT_CN;
288 }
289 
290 static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
291 {
292 	return NULL;
293 }
294 
295 static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
296 {
297 	if (net_ratelimit())
298 		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
299 		       skb->dev->name);
300 	kfree_skb(skb);
301 	return NET_XMIT_CN;
302 }
303 
304 struct Qdisc_ops noop_qdisc_ops = {
305 	.id		=	"noop",
306 	.priv_size	=	0,
307 	.enqueue	=	noop_enqueue,
308 	.dequeue	=	noop_dequeue,
309 	.requeue	=	noop_requeue,
310 	.owner		=	THIS_MODULE,
311 };
312 
313 struct Qdisc noop_qdisc = {
314 	.enqueue	=	noop_enqueue,
315 	.dequeue	=	noop_dequeue,
316 	.flags		=	TCQ_F_BUILTIN,
317 	.ops		=	&noop_qdisc_ops,
318 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
319 };
320 
321 static struct Qdisc_ops noqueue_qdisc_ops = {
322 	.id		=	"noqueue",
323 	.priv_size	=	0,
324 	.enqueue	=	noop_enqueue,
325 	.dequeue	=	noop_dequeue,
326 	.requeue	=	noop_requeue,
327 	.owner		=	THIS_MODULE,
328 };
329 
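/* noqueue: .enqueue is left NULL so dev_queue_xmit() bypasses the
 * scheduler entirely and hands packets straight to the driver; used
 * for virtual devices with tx_queue_len == 0 (see dev_activate()).
 */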
330 static struct Qdisc noqueue_qdisc = {
331 	.enqueue	=	NULL,
332 	.dequeue	=	noop_dequeue,
333 	.flags		=	TCQ_F_BUILTIN,
334 	.ops		=	&noqueue_qdisc_ops,
335 	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
336 };
337 
338 
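/* Map the 16 TC_PRIO_* values (skb->priority & TC_PRIO_MAX) onto the
 * three pfifo_fast bands; band 0 is dequeued first, band 2 last.
 */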
339 static const u8 prio2band[TC_PRIO_MAX+1] =
340 	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
341 
342 /* 3-band FIFO queue: old style, but should be a bit faster than
343    generic prio+fifo combination.
344  */
345 
346 #define PFIFO_FAST_BANDS 3
347 
348 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
349 					     struct Qdisc *qdisc)
350 {
351 	struct sk_buff_head *list = qdisc_priv(qdisc);
352 	return list + prio2band[skb->priority & TC_PRIO_MAX];
353 }
354 
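/* Enqueue at the tail of the band selected by skb->priority; each band
 * is individually limited to dev->tx_queue_len packets, beyond which
 * the packet is dropped.
 */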
355 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
356 {
357 	struct sk_buff_head *list = prio2list(skb, qdisc);
358 
359 	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
360 		qdisc->q.qlen++;
361 		return __qdisc_enqueue_tail(skb, qdisc, list);
362 	}
363 
364 	return qdisc_drop(skb, qdisc);
365 }
366 
367 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
368 {
369 	int prio;
370 	struct sk_buff_head *list = qdisc_priv(qdisc);
371 
372 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
373 		if (!skb_queue_empty(list + prio)) {
374 			qdisc->q.qlen--;
375 			return __qdisc_dequeue_head(qdisc, list + prio);
376 		}
377 	}
378 
379 	return NULL;
380 }
381 
382 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
383 {
384 	qdisc->q.qlen++;
385 	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
386 }
387 
388 static void pfifo_fast_reset(struct Qdisc* qdisc)
389 {
390 	int prio;
391 	struct sk_buff_head *list = qdisc_priv(qdisc);
392 
393 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
394 		__qdisc_reset_queue(qdisc, list + prio);
395 
396 	qdisc->qstats.backlog = 0;
397 	qdisc->q.qlen = 0;
398 }
399 
400 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
401 {
402 	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
403 
404 	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
405 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
406 	return skb->len;
407 
408 rtattr_failure:
409 	return -1;
410 }
411 
412 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
413 {
414 	int prio;
415 	struct sk_buff_head *list = qdisc_priv(qdisc);
416 
417 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
418 		skb_queue_head_init(list + prio);
419 
420 	return 0;
421 }
422 
423 static struct Qdisc_ops pfifo_fast_ops = {
424 	.id		=	"pfifo_fast",
425 	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
426 	.enqueue	=	pfifo_fast_enqueue,
427 	.dequeue	=	pfifo_fast_dequeue,
428 	.requeue	=	pfifo_fast_requeue,
429 	.init		=	pfifo_fast_init,
430 	.reset		=	pfifo_fast_reset,
431 	.dump		=	pfifo_fast_dump,
432 	.owner		=	THIS_MODULE,
433 };
434 
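/* Allocate a qdisc plus its private data in one block.  The buffer is
 * over-allocated by QDISC_ALIGNTO - 1 bytes and the struct Qdisc is
 * placed at the next aligned address inside it; sch->padded records
 * the offset so the original pointer can be recovered on free.
 */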
435 struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
436 {
437 	void *p;
438 	struct Qdisc *sch;
439 	unsigned int size;
440 	int err = -ENOBUFS;
441 
442 	/* ensure that the Qdisc and the private data are 32-byte aligned */
443 	size = QDISC_ALIGN(sizeof(*sch));
444 	size += ops->priv_size + (QDISC_ALIGNTO - 1);
445 
446 	p = kzalloc(size, GFP_KERNEL);
447 	if (!p)
448 		goto errout;
449 	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
450 	sch->padded = (char *) sch - (char *) p;
451 
452 	INIT_LIST_HEAD(&sch->list);
453 	skb_queue_head_init(&sch->q);
454 	sch->ops = ops;
455 	sch->enqueue = ops->enqueue;
456 	sch->dequeue = ops->dequeue;
457 	sch->dev = dev;
458 	dev_hold(dev);
459 	atomic_set(&sch->refcnt, 1);
460 
461 	return sch;
462 errout:
463 	return ERR_PTR(err);	/* err is already a negative errno (-ENOBUFS) */
464 }
465 
466 struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
467 				 unsigned int parentid)
468 {
469 	struct Qdisc *sch;
470 
471 	sch = qdisc_alloc(dev, ops);
472 	if (IS_ERR(sch))
473 		goto errout;
474 	sch->stats_lock = &dev->queue_lock;
475 	sch->parent = parentid;
476 
477 	if (!ops->init || ops->init(sch, NULL) == 0)
478 		return sch;
479 
480 	qdisc_destroy(sch);
481 errout:
482 	return NULL;
483 }
484 
485 /* Under dev->queue_lock and BH! */
486 
487 void qdisc_reset(struct Qdisc *qdisc)
488 {
489 	struct Qdisc_ops *ops = qdisc->ops;
490 
491 	if (ops->reset)
492 		ops->reset(qdisc);
493 }
494 
495 /* This is the RCU callback that frees a qdisc's memory once there
496  * are no further references to it. */
497 
498 static void __qdisc_destroy(struct rcu_head *head)
499 {
500 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
501 	kfree((char *) qdisc - qdisc->padded);
502 }
503 
504 /* Under dev->queue_lock and BH! */
505 
506 void qdisc_destroy(struct Qdisc *qdisc)
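/* Drops one caller reference.  The qdisc is torn down only when the
 * last reference goes away; built-in qdiscs (noop/noqueue) are never
 * destroyed.  The final kfree() is deferred to an RCU callback so that
 * lockless readers of dev->qdisc can finish first.
 */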
507 {
508 	struct Qdisc_ops  *ops = qdisc->ops;
509 
510 	if (qdisc->flags & TCQ_F_BUILTIN ||
511 	    !atomic_dec_and_test(&qdisc->refcnt))
512 		return;
513 
514 	list_del(&qdisc->list);
515 #ifdef CONFIG_NET_ESTIMATOR
516 	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
517 #endif
518 	if (ops->reset)
519 		ops->reset(qdisc);
520 	if (ops->destroy)
521 		ops->destroy(qdisc);
522 
523 	module_put(ops->owner);
524 	dev_put(qdisc->dev);
525 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
526 }
527 
528 void dev_activate(struct net_device *dev)
529 {
530 	/* If no queueing discipline is attached to the device, create a
531 	   default one: pfifo_fast for devices that need queueing
532 	   (tx_queue_len != 0) and noqueue_qdisc for virtual
533 	   interfaces.
534 	 */
535 
536 	if (dev->qdisc_sleeping == &noop_qdisc) {
537 		struct Qdisc *qdisc;
538 		if (dev->tx_queue_len) {
539 			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
540 						  TC_H_ROOT);
541 			if (qdisc == NULL) {
542 				printk(KERN_INFO "%s: activation failed\n", dev->name);
543 				return;
544 			}
545 			list_add_tail(&qdisc->list, &dev->qdisc_list);
546 		} else {
547 			qdisc =  &noqueue_qdisc;
548 		}
549 		dev->qdisc_sleeping = qdisc;
550 	}
551 
552 	if (!netif_carrier_ok(dev))
553 		/* Delay activation until next carrier-on event */
554 		return;
555 
556 	spin_lock_bh(&dev->queue_lock);
557 	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
558 	if (dev->qdisc != &noqueue_qdisc) {
559 		dev->trans_start = jiffies;
560 		dev_watchdog_up(dev);
561 	}
562 	spin_unlock_bh(&dev->queue_lock);
563 }
564 
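/* Detach the active qdisc: point dev->qdisc at noop_qdisc so newly
 * arriving packets are dropped, flush the old qdisc and any deferred
 * GSO skb, stop the watchdog, then wait for in-flight dev_queue_xmit()
 * callers (RCU) and for any running qdisc_run() to finish.
 */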
565 void dev_deactivate(struct net_device *dev)
566 {
567 	struct Qdisc *qdisc;
568 	struct sk_buff *skb;
569 
570 	spin_lock_bh(&dev->queue_lock);
571 	qdisc = dev->qdisc;
572 	dev->qdisc = &noop_qdisc;
573 
574 	qdisc_reset(qdisc);
575 
576 	skb = dev->gso_skb;
577 	dev->gso_skb = NULL;
578 	spin_unlock_bh(&dev->queue_lock);
579 
580 	kfree_skb(skb);
581 
582 	dev_watchdog_down(dev);
583 
584 	/* Wait for outstanding dev_queue_xmit calls. */
585 	synchronize_rcu();
586 
587 	/* Wait for outstanding qdisc_run calls. */
588 	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
589 		yield();
590 }
591 
592 void dev_init_scheduler(struct net_device *dev)
593 {
594 	qdisc_lock_tree(dev);
595 	dev->qdisc = &noop_qdisc;
596 	dev->qdisc_sleeping = &noop_qdisc;
597 	INIT_LIST_HEAD(&dev->qdisc_list);
598 	qdisc_unlock_tree(dev);
599 
600 	dev_watchdog_init(dev);
601 }
602 
603 void dev_shutdown(struct net_device *dev)
604 {
605 	struct Qdisc *qdisc;
606 
607 	qdisc_lock_tree(dev);
608 	qdisc = dev->qdisc_sleeping;
609 	dev->qdisc = &noop_qdisc;
610 	dev->qdisc_sleeping = &noop_qdisc;
611 	qdisc_destroy(qdisc);
612 #if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
613 	if ((qdisc = dev->qdisc_ingress) != NULL) {
614 		dev->qdisc_ingress = NULL;
615 		qdisc_destroy(qdisc);
616 	}
617 #endif
618 	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
619 	qdisc_unlock_tree(dev);
620 }
621 
622 EXPORT_SYMBOL(netif_carrier_on);
623 EXPORT_SYMBOL(netif_carrier_off);
624 EXPORT_SYMBOL(noop_qdisc);
625 EXPORT_SYMBOL(qdisc_create_dflt);
626 EXPORT_SYMBOL(qdisc_destroy);
627 EXPORT_SYMBOL(qdisc_reset);
628 EXPORT_SYMBOL(qdisc_lock_tree);
629 EXPORT_SYMBOL(qdisc_unlock_tree);
630