xref: /openbmc/linux/net/sched/sch_netem.c (revision 2504075d)
/*
 * net/sched/sch_netem.c	Network emulator
 *
 * 		This program is free software; you can redistribute it and/or
 * 		modify it under the terms of the GNU General Public License
 * 		as published by the Free Software Foundation; either version
 * 		2 of the License.
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>

#define VERSION "1.2"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can also be loaded from a table to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines on top of it.  Nor does it need to
	 do bandwidth control, since that can be handled with a token
	 bucket or other rate-control discipline.
*/
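
/* A rough, illustrative example of how these knobs are driven from user
 * space (assuming the iproute2 "tc" front end and an interface named eth0,
 * both of which are just placeholders here):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem loss 0.3% duplicate 1% corrupt 0.1%
 *
 * The first command adds 100ms of delay with 10ms of jitter (25% correlated);
 * the second adds random loss, duplication and corruption on top.
 */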

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
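/* It is stored in skb->cb[] after struct qdisc_skb_cb; netem_skb_cb()
 * below verifies at build time that the two together still fit.
 */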
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(skb->cb) <
		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
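/* A sketch of the fixed-point recurrence used below (32-bit fractions,
 * with rho here meaning state->rho + 1):
 *
 *	answer = ((2^32 - rho) * U + rho * last) >> 32
 *
 * i.e. each output is a weighted blend of a fresh uniform sample U and the
 * previous output, with weight rho/2^32 on the history.
 */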
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
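/* The table entries are samples of the target distribution scaled by
 * NETEM_DIST_SCALE, so the result below is essentially
 *
 *	mu + sigma * t / NETEM_DIST_SCALE
 *
 * computed with sigma split into quotient and remainder (and the remainder
 * term rounded to nearest) to keep the intermediate multiply from
 * overflowing.
 */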
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb))) {
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();
		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
		q->qdisc->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
	} else if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
	}

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (sch->flags & TCQ_F_THROTTLED)
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);
		psched_time_t now = psched_get_time();

		/* is it time to send this packet? */
		if (cb->time_to_send <= now) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif
			pr_debug("netem_dequeue: return skb=%p\n", skb);
			sch->q.qlen--;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&q->watchdog);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
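/* The table is typically built by user space; iproute2 ships pre-computed
 * normal, pareto and paretonormal tables which tc passes down in
 * TCA_NETEM_DELAY_DIST, e.g.:
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms distribution normal
 *
 * (eth0 is only a placeholder device name.)
 */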
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	kfree(q->delay_dist);
	q->delay_dist = d;
	spin_unlock_bh(root_lock);
	return 0;
}

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
};

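/* netem's TCA_OPTIONS payload is a bare struct tc_netem_qopt optionally
 * followed by nested attributes (TCA_NETEM_CORR and friends), so it cannot
 * be fed to the nested-attribute parser directly; parse_attr() skips the
 * leading NLA_ALIGN(len) bytes and parses whatever remains.
 */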
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0)
		return -EINVAL;
	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);
	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	ret = fifo_set_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	return 0;
}

/*
 * Special case version of FIFO queue for use by netem.
 * It queues packets in order of their time_to_send timestamps.
 */
struct fifo_sched_data {
	u32 limit;
	psched_time_t oldest;
};

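/* tfifo keeps its queue sorted by time_to_send.  q->oldest caches the
 * timestamp of the last packet appended at the tail, so in-order arrivals
 * (the common case) take the O(1) add-at-tail path while out-of-order ones
 * walk the list backwards to find their slot.
 */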
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb;

	if (likely(skb_queue_len(list) < q->limit)) {
		/* Optimize for add at tail */
		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
			q->oldest = tnext;
			return qdisc_enqueue_tail(nskb, sch);
		}

		skb_queue_reverse_walk(list, skb) {
			const struct netem_skb_cb *cb = netem_skb_cb(skb);

			if (tnext >= cb->time_to_send)
				break;
		}

		__skb_queue_after(list, skb, nskb);

		sch->qstats.backlog += qdisc_pkt_len(nskb);
		sch->bstats.bytes += qdisc_pkt_len(nskb);
		sch->bstats.packets++;

		return NET_XMIT_SUCCESS;
	}

	return qdisc_reshape_fail(nskb, sch);
}

static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *q = qdisc_priv(sch);

	if (opt) {
		struct tc_fifo_qopt *ctl = nla_data(opt);
		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		q->limit = ctl->limit;
	} else
		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);

	q->oldest = PSCHED_PASTPERFECT;
	return 0;
}

static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fifo_sched_data *q = qdisc_priv(sch);
	struct tc_fifo_qopt opt = { .limit = q->limit };

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
	.id		=	"tfifo",
	.priv_size	=	sizeof(struct fifo_sched_data),
	.enqueue	=	tfifo_enqueue,
	.dequeue	=	qdisc_dequeue_head,
	.peek		=	qdisc_peek_head,
	.drop		=	qdisc_queue_drop,
	.init		=	tfifo_init,
	.reset		=	qdisc_reset_queue,
	.change		=	tfifo_init,
	.dump		=	tfifo_dump,
};

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
				     TC_H_MAKE(sch->handle, 1));
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct nlattr *nla = (struct nlattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	nla->nla_len = skb_tail_pointer(skb) - b;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");