xref: /openbmc/linux/net/sched/sch_netem.c (revision b6bec26c)
/*
 * net/sched/sch_netem.c	Network emulator
 *
 * 		This program is free software; you can redistribute it and/or
 * 		modify it under the terms of the GNU General Public License
 * 		as published by the Free Software Foundation; either version
 * 		2 of the License.
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can be loaded from a table as well to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines on top of it.  It does not need to do
	 bandwidth control either, since that can be handled by using token
	 bucket or other rate control.

     Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
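
/* Example usage (illustrative; the exact syntax belongs to the tc(8)
 * userspace tool, not to this file):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.3% 25%
 *
 * adds 100ms of delay with +/- 10ms of jitter (25% correlated with the
 * previous delay) and 0.3% packet loss (25% correlated) to packets
 * leaving eth0.
 */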

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u32 rate;
	s32 packet_overhead;
	u32 cell_size;
	u32 cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;

	enum  {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
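
/* Illustrative note (added, not from the original source): the expression
 * above is a 32.32 fixed-point weighted average of a fresh random value and
 * the previous output,
 *
 *	answer = (value * (2^32 - rho) + last * rho) >> 32
 *
 * so rho/2^32 acts as the correlation weight: rho == 0 degenerates to plain
 * net_random(), while rho close to 2^32 makes successive samples track each
 * other closely.
 */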

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = net_random();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   1 => successfully transmitted packets within a gap period
	 *   4 => isolated losses within a gap period
	 *   3 => lost packets within a burst period
	 *   2 => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case 1:
		if (rnd < clg->a4) {
			clg->state = 4;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1) {
			clg->state = 3;
			return true;
		} else if (clg->a1 < rnd)
			clg->state = 1;

		break;
	case 2:
		if (rnd < clg->a5) {
			clg->state = 3;
			return true;
		} else
			clg->state = 2;

		break;
	case 3:
		if (rnd < clg->a3)
			clg->state = 2;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = 1;
			return true;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = 3;
			return true;
		}
		break;
	case 4:
		clg->state = 1;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases  (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between a random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case 1:
		if (net_random() < clg->a1)
			clg->state = 2;
		if (net_random() < clg->a4)
			return true;
		break;
	case 2:
		if (net_random() < clg->a2)
			clg->state = 1;
		if (clg->a3 > net_random())
			return true;
		break;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model):
		 * a true return from the Markov 4-state loss generator
		 * means the next packet has to be dropped.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * a true return from the Gilbert-Elliot loss generator
		 * means the next packet has to be dropped.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}
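
/* Note on probability encoding (added for clarity; an assumption based on
 * the "0 => none, ~0 => all" convention above): q->loss, q->duplicate,
 * q->reorder and q->corrupt are 32-bit fixed-point probabilities where
 * ~0 (UINT_MAX) means 100%, so a configured loss of 1% arrives here as
 * roughly 0.01 * 2^32.
 */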


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
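
/* Illustrative note (added, not from the original source): the split of
 * sigma into (sigma / NETEM_DIST_SCALE) and (sigma % NETEM_DIST_SCALE)
 * simply computes
 *
 *	mu + (t * sigma) / NETEM_DIST_SCALE
 *
 * without overflow, rounding the remainder term.  The table entries t are
 * signed samples of the desired distribution scaled by NETEM_DIST_SCALE,
 * so t == NETEM_DIST_SCALE corresponds to roughly one standard deviation
 * above the mean.
 */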

static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
{
	u64 ticks;

	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	ticks = (u64)len * NSEC_PER_SEC;

	do_div(ticks, q->rate);
	return PSCHED_NS2TICKS(ticks);
}
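
/* Worked example (added for illustration, assuming q->rate is expressed in
 * bytes per second as supplied by the rate attribute): with rate = 125000
 * (i.e. 1 Mbit/s) and len = 1500, the transmission time is
 *
 *	1500 * NSEC_PER_SEC / 125000 = 12,000,000 ns = 12 ms,
 *
 * which is then converted to scheduler ticks.
 */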

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct sk_buff_head *list = &sch->q;
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *skb = skb_peek_tail(list);

	/* Optimize for add at tail */
	if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
		return __skb_queue_tail(list, nskb);

	skb_queue_reverse_walk(list, skb) {
		if (tnext >= netem_skb_cb(skb)->time_to_send)
			break;
	}

	__skb_queue_after(list, skb, nskb);
}
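
/* Descriptive note (added): tfifo_enqueue() keeps sch->q sorted by
 * time_to_send, walking backwards from the tail because most packets carry
 * a later timestamp than everything already queued.  This ordering is what
 * lets netem_dequeue() only ever inspect the head of the queue.
 */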

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 * 	NET_XMIT_DROP: queue length didn't change.
 *      NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			sch->qstats.drops++; /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 * Ideally, this orphaning should be done after the rate limiting
	 * module, because this breaks TCP Small Queue, and other mechanisms
	 * based on socket sk_wmem_alloc.
	 */
	if (q->latency || q->jitter)
		skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb)))
			return qdisc_drop(skb, sch);

		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
	}

	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
		return qdisc_reshape_fail(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();

		if (q->rate) {
			struct sk_buff_head *list = &sch->q;

			if (!skb_queue_empty(list)) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now;
				delay = max_t(psched_tdiff_t, 0, delay);
				now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
			}

			delay += packet_len_2_sched_time(skb->len, q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&sch->q, skb);
		sch->qstats.requeues++;
	}

	return NET_XMIT_SUCCESS;
}
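
/* Example (illustrative; syntax belongs to the tc(8) userspace tool):
 *
 *	tc qdisc change dev eth0 root netem delay 10ms reorder 25% 50% gap 5
 *
 * sends every 5th packet (subject to the 25% reorder probability, 50%
 * correlated) immediately via the head of the queue, so it overtakes the
 * packets that are still being delayed by 10ms.
 */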

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (!len && q->qdisc && q->qdisc->ops->drop)
		len = q->qdisc->ops->drop(q->qdisc);
	if (len)
		sch->qstats.drops++;

	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (qdisc_is_throttled(sch))
		return NULL;

tfifo_dequeue:
	skb = qdisc_peek_head(sch);
	if (skb) {
		const struct netem_skb_cb *cb = netem_skb_cb(skb);

		/* is it time to send this packet? */
		if (cb->time_to_send <= psched_get_time()) {
			__skb_unlink(skb, &sch->q);
			sch->qstats.backlog -= qdisc_pkt_len(skb);

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			if (q->qdisc) {
				int err = qdisc_enqueue(skb, q->qdisc);

				if (unlikely(err != NET_XMIT_SUCCESS)) {
					if (net_xmit_drop_count(err)) {
						sch->qstats.drops++;
						qdisc_tree_decrease_qlen(sch, 1);
					}
				}
				goto tfifo_dequeue;
			}
deliver:
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}
		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	if (d) {
		if (is_vmalloc_addr(d))
			vfree(d);
		else
			kfree(d);
	}
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(q->delay_dist, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}
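
/* Descriptive note (added): the new table is built outside of any qdisc
 * lock (kmalloc with a vmalloc fallback for large tables), swapped in under
 * the root sleeping lock, and the previous table is freed afterwards, so a
 * concurrent dequeue never sees a half-initialized table.
 */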

static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	q->cell_overhead = r->cell_overhead;
}

static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = 1;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = 1;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
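
/* Descriptive note (added): TCA_OPTIONS for netem carries a struct
 * tc_netem_qopt at its head, optionally followed by nested netlink
 * attributes.  parse_attr() therefore skips the aligned qopt header and
 * parses only the remainder, leaving tb[] zeroed when no attributes follow.
 */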

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	sch->limit = qopt->limit;

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(sch, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			return ret;
	}

	if (tb[TCA_NETEM_REORDER])
		get_reorder(sch, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(sch, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	q->loss_model = CLG_RANDOM;
	if (tb[TCA_NETEM_LOSS])
		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);

	return ret;
}

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	rate.rate = q->rate;
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	if (*old) {
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
	}
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");