xref: /openbmc/linux/net/sched/sch_netem.c (revision 5f32c314)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net, which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool"
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification; that can be handled by
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either, since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to a 4-state
59 	Markov chain and to the (2-state) "Gilbert-Elliot" model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
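
/* Example userspace usage via iproute2's tc (illustrative only; see the
 * tc-netem(8) man page for the authoritative syntax):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem loss 0.3% 25% duplicate 1%
 */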
70 
71 struct netem_sched_data {
72 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
73 	struct rb_root t_root;
74 
75 	/* optional qdisc for classful handling (NULL at netem init) */
76 	struct Qdisc	*qdisc;
77 
78 	struct qdisc_watchdog watchdog;
79 
80 	psched_tdiff_t latency;
81 	psched_tdiff_t jitter;
82 
83 	u32 loss;
84 	u32 ecn;
85 	u32 limit;
86 	u32 counter;
87 	u32 gap;
88 	u32 duplicate;
89 	u32 reorder;
90 	u32 corrupt;
91 	u64 rate;
92 	s32 packet_overhead;
93 	u32 cell_size;
94 	struct reciprocal_value cell_size_reciprocal;
95 	s32 cell_overhead;
96 
97 	struct crndstate {
98 		u32 last;
99 		u32 rho;
100 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
101 
102 	struct disttable {
103 		u32  size;
104 		s16 table[0];
105 	} *delay_dist;
106 
107 	enum  {
108 		CLG_RANDOM,
109 		CLG_4_STATES,
110 		CLG_GILB_ELL,
111 	} loss_model;
112 
113 	enum {
114 		TX_IN_GAP_PERIOD = 1,
115 		TX_IN_BURST_PERIOD,
116 		LOST_IN_GAP_PERIOD,
117 		LOST_IN_BURST_PERIOD,
118 	} _4_state_model;
119 
120 	/* Correlated Loss Generation models */
121 	struct clgstate {
122 		/* state of the Markov chain */
123 		u8 state;
124 
125 		/* 4-states and Gilbert-Elliot models */
126 		u32 a1;	/* p13 for 4-states or p for GE */
127 		u32 a2;	/* p31 for 4-states or r for GE */
128 		u32 a3;	/* p32 for 4-states or h for GE */
129 		u32 a4;	/* p14 for 4-states or 1-k for GE */
130 		u32 a5; /* p23 used only in 4-states */
131 	} clg;
132 
133 };
134 
135 /* Time stamp put into socket buffer control block
136  * Only valid when skbs are in our internal t(ime)fifo queue.
137  */
138 struct netem_skb_cb {
139 	psched_time_t	time_to_send;
140 	ktime_t		tstamp_save;
141 };
142 
143 /* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
144  * to hold a rb_node structure.
145  *
146  * If struct sk_buff layout is changed, the following checks will complain.
147  */
148 static struct rb_node *netem_rb_node(struct sk_buff *skb)
149 {
150 	BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
151 	BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
152 		     offsetof(struct sk_buff, next) + sizeof(skb->next));
153 	BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
154 		     offsetof(struct sk_buff, prev) + sizeof(skb->prev));
155 	BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
156 					      sizeof(skb->prev) +
157 					      sizeof(skb->tstamp));
158 	return (struct rb_node *)&skb->next;
159 }
160 
161 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
162 {
163 	return (struct sk_buff *)rb;
164 }
165 
166 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
167 {
168 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
169 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
170 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
171 }
172 
173 /* init_crandom - initialize correlated random number generator
174  * Use entropy source for initial seed.
175  */
176 static void init_crandom(struct crndstate *state, unsigned long rho)
177 {
178 	state->rho = rho;
179 	state->last = prandom_u32();
180 }
181 
182 /* get_crandom - correlated random number generator
183  * Next number depends on last value.
184  * rho is scaled to avoid floating point.
185  */
186 static u32 get_crandom(struct crndstate *state)
187 {
188 	u64 value, rho;
189 	unsigned long answer;
190 
191 	if (state->rho == 0)	/* no correlation */
192 		return prandom_u32();
193 
194 	value = prandom_u32();
195 	rho = (u64)state->rho + 1;
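	/* Fixed-point blend: answer = ((2^32 - rho) * value + rho * last) >> 32,
	 * i.e. the previous output is mixed in with weight rho / 2^32.
	 */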
196 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
197 	state->last = answer;
198 	return answer;
199 }
200 
201 /* loss_4state - 4-state model loss generator
202  * Generates losses according to the 4-state Markov chain adopted in
203  * the GI (General and Intuitive) loss model.
204  */
205 static bool loss_4state(struct netem_sched_data *q)
206 {
207 	struct clgstate *clg = &q->clg;
208 	u32 rnd = prandom_u32();
209 
210 	/*
211 	 * Compares rnd with the transition probabilities outgoing from
212 	 * the current state, then decides the next state and whether the
213 	 * next packet has to be transmitted or lost.
214 	 * The four states correspond to:
215 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
216 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
217 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
218 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
219 	 */
220 	switch (clg->state) {
221 	case TX_IN_GAP_PERIOD:
222 		if (rnd < clg->a4) {
223 			clg->state = LOST_IN_BURST_PERIOD;
224 			return true;
225 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
226 			clg->state = LOST_IN_GAP_PERIOD;
227 			return true;
228 		} else if (clg->a1 + clg->a4 < rnd) {
229 			clg->state = TX_IN_GAP_PERIOD;
230 		}
231 
232 		break;
233 	case TX_IN_BURST_PERIOD:
234 		if (rnd < clg->a5) {
235 			clg->state = LOST_IN_GAP_PERIOD;
236 			return true;
237 		} else {
238 			clg->state = TX_IN_BURST_PERIOD;
239 		}
240 
241 		break;
242 	case LOST_IN_GAP_PERIOD:
243 		if (rnd < clg->a3)
244 			clg->state = TX_IN_BURST_PERIOD;
245 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
246 			clg->state = TX_IN_GAP_PERIOD;
247 		} else if (clg->a2 + clg->a3 < rnd) {
248 			clg->state = LOST_IN_GAP_PERIOD;
249 			return true;
250 		}
251 		break;
252 	case LOST_IN_BURST_PERIOD:
253 		clg->state = TX_IN_GAP_PERIOD;
254 		break;
255 	}
256 
257 	return false;
258 }
259 
260 /* loss_gilb_ell - Gilbert-Elliot model loss generator
261  * Generates losses according to the Gilbert-Elliot loss model or
262  * its special cases  (Gilbert or Simple Gilbert)
263  *
264  * Makes a comparison between random number and the transition
265  * probabilities outgoing from the current state, then decides the
266  * next state. A second random number is extracted and the comparison
267  * with the loss probability of the current state decides if the next
268  * packet will be transmitted or lost.
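 *
 * State 1 is the "good" state and state 2 the "bad" one: a1 (p) and a2 (r)
 * are the good->bad and bad->good transition probabilities, a4 (1-k) is the
 * loss probability in the good state and a3 (h) is the transmission
 * probability in the bad state.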
269  */
270 static bool loss_gilb_ell(struct netem_sched_data *q)
271 {
272 	struct clgstate *clg = &q->clg;
273 
274 	switch (clg->state) {
275 	case 1:
276 		if (prandom_u32() < clg->a1)
277 			clg->state = 2;
278 		if (prandom_u32() < clg->a4)
279 			return true;
280 		break;
281 	case 2:
282 		if (prandom_u32() < clg->a2)
283 			clg->state = 1;
284 		if (prandom_u32() > clg->a3)
285 			return true;
286 	}
287 
288 	return false;
289 }
290 
291 static bool loss_event(struct netem_sched_data *q)
292 {
293 	switch (q->loss_model) {
294 	case CLG_RANDOM:
295 		/* Random packet drop 0 => none, ~0 => all */
296 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
297 
298 	case CLG_4_STATES:
299 		/* 4-state loss model algorithm (used also for the GI model).
300 		 * Queries the Markov 4-state loss generator; a true return
301 		 * value from loss_4state() means the packet has to be
302 		 * dropped.
303 		 */
304 		return loss_4state(q);
305 
306 	case CLG_GILB_ELL:
307 		/* Gilbert-Elliot loss model algorithm.
308 		 * Queries the Gilbert-Elliot loss generator; a true return
309 		 * value from loss_gilb_ell() means the packet has to be
310 		 * dropped.
311 		 */
312 		return loss_gilb_ell(q);
313 	}
314 
315 	return false;	/* not reached */
316 }
317 
318 
319 /* tabledist - return a pseudo-randomly distributed value with mean mu and
320  * std deviation sigma.  Uses table lookup to approximate the desired
321  * distribution, and a uniformly-distributed pseudo-random source.
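 *
 * Table entries are samples of the target distribution scaled by
 * NETEM_DIST_SCALE, so the result is approximately
 * mu + t * sigma / NETEM_DIST_SCALE; the multiplication below is split
 * into quotient and remainder parts of sigma to limit overflow and
 * rounding error.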
322  */
323 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
324 				struct crndstate *state,
325 				const struct disttable *dist)
326 {
327 	psched_tdiff_t x;
328 	long t;
329 	u32 rnd;
330 
331 	if (sigma == 0)
332 		return mu;
333 
334 	rnd = get_crandom(state);
335 
336 	/* default uniform distribution */
337 	if (dist == NULL)
338 		return (rnd % (2*sigma)) - sigma + mu;
339 
340 	t = dist->table[rnd % dist->size];
341 	x = (sigma % NETEM_DIST_SCALE) * t;
342 	if (x >= 0)
343 		x += NETEM_DIST_SCALE/2;
344 	else
345 		x -= NETEM_DIST_SCALE/2;
346 
347 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
348 }
349 
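/* Transmission time of a packet of the given length at q->rate (in bytes
 * per second): the length is padded with q->packet_overhead and, when
 * q->cell_size is set, rounded up to a whole number of cells (each with
 * q->cell_overhead extra bytes) before the division.
 */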
350 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
351 {
352 	u64 ticks;
353 
354 	len += q->packet_overhead;
355 
356 	if (q->cell_size) {
357 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
358 
359 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
360 			cells++;
361 		len = cells * (q->cell_size + q->cell_overhead);
362 	}
363 
364 	ticks = (u64)len * NSEC_PER_SEC;
365 
366 	do_div(ticks, q->rate);
367 	return PSCHED_NS2TICKS(ticks);
368 }
369 
370 static void tfifo_reset(struct Qdisc *sch)
371 {
372 	struct netem_sched_data *q = qdisc_priv(sch);
373 	struct rb_node *p;
374 
375 	while ((p = rb_first(&q->t_root))) {
376 		struct sk_buff *skb = netem_rb_to_skb(p);
377 
378 		rb_erase(p, &q->t_root);
379 		skb->next = NULL;
380 		skb->prev = NULL;
381 		kfree_skb(skb);
382 	}
383 }
384 
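/* Insert the skb into the time-ordered rbtree keyed on time_to_send.
 * Entries with equal timestamps go to the right, so packets scheduled
 * for the same time keep their FIFO order.
 */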
385 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
386 {
387 	struct netem_sched_data *q = qdisc_priv(sch);
388 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
389 	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
390 
391 	while (*p) {
392 		struct sk_buff *skb;
393 
394 		parent = *p;
395 		skb = netem_rb_to_skb(parent);
396 		if (tnext >= netem_skb_cb(skb)->time_to_send)
397 			p = &parent->rb_right;
398 		else
399 			p = &parent->rb_left;
400 	}
401 	rb_link_node(netem_rb_node(nskb), parent, p);
402 	rb_insert_color(netem_rb_node(nskb), &q->t_root);
403 	sch->q.qlen++;
404 }
405 
406 /*
407  * Insert one skb into qdisc.
408  * Note: parent depends on return value to account for queue length.
409  * 	NET_XMIT_DROP: queue length didn't change.
410  *      NET_XMIT_SUCCESS: one skb was queued.
411  */
412 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
413 {
414 	struct netem_sched_data *q = qdisc_priv(sch);
415 	/* We don't fill cb now as skb_unshare() may invalidate it */
416 	struct netem_skb_cb *cb;
417 	struct sk_buff *skb2;
418 	int count = 1;
419 
420 	/* Random duplication */
421 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
422 		++count;
423 
424 	/* Drop packet? */
425 	if (loss_event(q)) {
426 		if (q->ecn && INET_ECN_set_ce(skb))
427 			sch->qstats.drops++; /* mark packet */
428 		else
429 			--count;
430 	}
431 	if (count == 0) {
432 		sch->qstats.drops++;
433 		kfree_skb(skb);
434 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
435 	}
436 
437 	/* If a delay is expected, orphan the skb. (orphaning usually takes
438 	 * place at TX completion time, so _before_ the link transit delay)
439 	 */
440 	if (q->latency || q->jitter)
441 		skb_orphan_partial(skb);
442 
443 	/*
444 	 * If we need to duplicate packet, then re-insert at top of the
445 	 * qdisc tree, since parent queuer expects that only one
446 	 * skb will be queued.
447 	 */
448 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
449 		struct Qdisc *rootq = qdisc_root(sch);
450 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
451 		q->duplicate = 0;
452 
453 		qdisc_enqueue_root(skb2, rootq);
454 		q->duplicate = dupsave;
455 	}
456 
457 	/*
458 	 * Randomized packet corruption.
459 	 * Make a copy if needed since we are modifying the data.
460 	 * If packet is going to be hardware checksummed, then
461 	 * do it now in software before we mangle it.
462 	 */
463 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
464 		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
465 		    (skb->ip_summed == CHECKSUM_PARTIAL &&
466 		     skb_checksum_help(skb)))
467 			return qdisc_drop(skb, sch);
468 
469 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
470 			1<<(prandom_u32() % 8);
471 	}
472 
473 	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
474 		return qdisc_reshape_fail(skb, sch);
475 
476 	sch->qstats.backlog += qdisc_pkt_len(skb);
477 
478 	cb = netem_skb_cb(skb);
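	/* Reordering: once at least q->gap - 1 packets have taken the normal
	 * delayed path, the next packet is sent immediately (queued at the
	 * head, see the else branch below) with probability q->reorder,
	 * scaled so that ~0 means always.
	 */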
479 	if (q->gap == 0 ||		/* not doing reordering */
480 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
481 	    q->reorder < get_crandom(&q->reorder_cor)) {
482 		psched_time_t now;
483 		psched_tdiff_t delay;
484 
485 		delay = tabledist(q->latency, q->jitter,
486 				  &q->delay_cor, q->delay_dist);
487 
488 		now = psched_get_time();
489 
490 		if (q->rate) {
491 			struct sk_buff *last;
492 
493 			if (!skb_queue_empty(&sch->q))
494 				last = skb_peek_tail(&sch->q);
495 			else
496 				last = netem_rb_to_skb(rb_last(&q->t_root));
497 			if (last) {
498 				/*
499 			 * The last packet in the queue is the reference
500 			 * point (now); calculate its time bonus and
501 			 * subtract it from the delay.
502 				 */
503 				delay -= netem_skb_cb(last)->time_to_send - now;
504 				delay = max_t(psched_tdiff_t, 0, delay);
505 				now = netem_skb_cb(last)->time_to_send;
506 			}
507 
508 			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
509 		}
510 
511 		cb->time_to_send = now + delay;
512 		cb->tstamp_save = skb->tstamp;
513 		++q->counter;
514 		tfifo_enqueue(skb, sch);
515 	} else {
516 		/*
517 		 * Do re-ordering by putting one out of N packets at the front
518 		 * of the queue.
519 		 */
520 		cb->time_to_send = psched_get_time();
521 		q->counter = 0;
522 
523 		__skb_queue_head(&sch->q, skb);
524 		sch->qstats.requeues++;
525 	}
526 
527 	return NET_XMIT_SUCCESS;
528 }
529 
530 static unsigned int netem_drop(struct Qdisc *sch)
531 {
532 	struct netem_sched_data *q = qdisc_priv(sch);
533 	unsigned int len;
534 
535 	len = qdisc_queue_drop(sch);
536 
537 	if (!len) {
538 		struct rb_node *p = rb_first(&q->t_root);
539 
540 		if (p) {
541 			struct sk_buff *skb = netem_rb_to_skb(p);
542 
543 			rb_erase(p, &q->t_root);
544 			sch->q.qlen--;
545 			skb->next = NULL;
546 			skb->prev = NULL;
547 			len = qdisc_pkt_len(skb);
548 			sch->qstats.backlog -= len;
549 			kfree_skb(skb);
550 		}
551 	}
552 	if (!len && q->qdisc && q->qdisc->ops->drop)
553 		len = q->qdisc->ops->drop(q->qdisc);
554 	if (len)
555 		sch->qstats.drops++;
556 
557 	return len;
558 }
559 
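/* Dequeue order: packets that were re-queued to the head for reordering
 * are taken from sch->q first; otherwise the earliest tfifo entry is
 * taken once its time_to_send has passed and is either delivered or fed
 * through the optional child qdisc.  If nothing is due yet, the qdisc
 * watchdog is armed for the next time_to_send.
 */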
560 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
561 {
562 	struct netem_sched_data *q = qdisc_priv(sch);
563 	struct sk_buff *skb;
564 	struct rb_node *p;
565 
566 	if (qdisc_is_throttled(sch))
567 		return NULL;
568 
569 tfifo_dequeue:
570 	skb = __skb_dequeue(&sch->q);
571 	if (skb) {
572 deliver:
573 		sch->qstats.backlog -= qdisc_pkt_len(skb);
574 		qdisc_unthrottled(sch);
575 		qdisc_bstats_update(sch, skb);
576 		return skb;
577 	}
578 	p = rb_first(&q->t_root);
579 	if (p) {
580 		psched_time_t time_to_send;
581 
582 		skb = netem_rb_to_skb(p);
583 
584 		/* has this packet's send time been reached yet? */
585 		time_to_send = netem_skb_cb(skb)->time_to_send;
586 		if (time_to_send <= psched_get_time()) {
587 			rb_erase(p, &q->t_root);
588 
589 			sch->q.qlen--;
590 			skb->next = NULL;
591 			skb->prev = NULL;
592 			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
593 
594 #ifdef CONFIG_NET_CLS_ACT
595 			/*
596 			 * If it's at ingress let's pretend the delay is
597 			 * from the network (tstamp will be updated).
598 			 */
599 			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
600 				skb->tstamp.tv64 = 0;
601 #endif
602 
603 			if (q->qdisc) {
604 				int err = qdisc_enqueue(skb, q->qdisc);
605 
606 				if (unlikely(err != NET_XMIT_SUCCESS)) {
607 					if (net_xmit_drop_count(err)) {
608 						sch->qstats.drops++;
609 						qdisc_tree_decrease_qlen(sch, 1);
610 					}
611 				}
612 				goto tfifo_dequeue;
613 			}
614 			goto deliver;
615 		}
616 
617 		if (q->qdisc) {
618 			skb = q->qdisc->ops->dequeue(q->qdisc);
619 			if (skb)
620 				goto deliver;
621 		}
622 		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
623 	}
624 
625 	if (q->qdisc) {
626 		skb = q->qdisc->ops->dequeue(q->qdisc);
627 		if (skb)
628 			goto deliver;
629 	}
630 	return NULL;
631 }
632 
633 static void netem_reset(struct Qdisc *sch)
634 {
635 	struct netem_sched_data *q = qdisc_priv(sch);
636 
637 	qdisc_reset_queue(sch);
638 	tfifo_reset(sch);
639 	if (q->qdisc)
640 		qdisc_reset(q->qdisc);
641 	qdisc_watchdog_cancel(&q->watchdog);
642 }
643 
644 static void dist_free(struct disttable *d)
645 {
646 	if (d) {
647 		if (is_vmalloc_addr(d))
648 			vfree(d);
649 		else
650 			kfree(d);
651 	}
652 }
653 
654 /*
655  * Distribution data is a variable size payload containing
656  * signed 16 bit values.
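 * They are loaded through the TCA_NETEM_DELAY_DIST attribute (userspace,
 * e.g. iproute2, typically supplies pre-generated tables such as the
 * normal or pareto distributions) and consumed by tabledist() above.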
657  */
658 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
659 {
660 	struct netem_sched_data *q = qdisc_priv(sch);
661 	size_t n = nla_len(attr)/sizeof(__s16);
662 	const __s16 *data = nla_data(attr);
663 	spinlock_t *root_lock;
664 	struct disttable *d;
665 	int i;
666 	size_t s;
667 
668 	if (n > NETEM_DIST_MAX)
669 		return -EINVAL;
670 
671 	s = sizeof(struct disttable) + n * sizeof(s16);
672 	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
673 	if (!d)
674 		d = vmalloc(s);
675 	if (!d)
676 		return -ENOMEM;
677 
678 	d->size = n;
679 	for (i = 0; i < n; i++)
680 		d->table[i] = data[i];
681 
682 	root_lock = qdisc_root_sleeping_lock(sch);
683 
684 	spin_lock_bh(root_lock);
685 	swap(q->delay_dist, d);
686 	spin_unlock_bh(root_lock);
687 
688 	dist_free(d);
689 	return 0;
690 }
691 
692 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
693 {
694 	struct netem_sched_data *q = qdisc_priv(sch);
695 	const struct tc_netem_corr *c = nla_data(attr);
696 
697 	init_crandom(&q->delay_cor, c->delay_corr);
698 	init_crandom(&q->loss_cor, c->loss_corr);
699 	init_crandom(&q->dup_cor, c->dup_corr);
700 }
701 
702 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
703 {
704 	struct netem_sched_data *q = qdisc_priv(sch);
705 	const struct tc_netem_reorder *r = nla_data(attr);
706 
707 	q->reorder = r->probability;
708 	init_crandom(&q->reorder_cor, r->correlation);
709 }
710 
711 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
712 {
713 	struct netem_sched_data *q = qdisc_priv(sch);
714 	const struct tc_netem_corrupt *r = nla_data(attr);
715 
716 	q->corrupt = r->probability;
717 	init_crandom(&q->corrupt_cor, r->correlation);
718 }
719 
720 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
721 {
722 	struct netem_sched_data *q = qdisc_priv(sch);
723 	const struct tc_netem_rate *r = nla_data(attr);
724 
725 	q->rate = r->rate;
726 	q->packet_overhead = r->packet_overhead;
727 	q->cell_size = r->cell_size;
728 	q->cell_overhead = r->cell_overhead;
729 	if (q->cell_size)
730 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
731 	else
732 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
733 }
734 
735 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
736 {
737 	struct netem_sched_data *q = qdisc_priv(sch);
738 	const struct nlattr *la;
739 	int rem;
740 
741 	nla_for_each_nested(la, attr, rem) {
742 		u16 type = nla_type(la);
743 
744 		switch (type) {
745 		case NETEM_LOSS_GI: {
746 			const struct tc_netem_gimodel *gi = nla_data(la);
747 
748 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
749 				pr_info("netem: incorrect gi model size\n");
750 				return -EINVAL;
751 			}
752 
753 			q->loss_model = CLG_4_STATES;
754 
755 			q->clg.state = 1;
756 			q->clg.a1 = gi->p13;
757 			q->clg.a2 = gi->p31;
758 			q->clg.a3 = gi->p32;
759 			q->clg.a4 = gi->p14;
760 			q->clg.a5 = gi->p23;
761 			break;
762 		}
763 
764 		case NETEM_LOSS_GE: {
765 			const struct tc_netem_gemodel *ge = nla_data(la);
766 
767 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
768 				pr_info("netem: incorrect ge model size\n");
769 				return -EINVAL;
770 			}
771 
772 			q->loss_model = CLG_GILB_ELL;
773 			q->clg.state = 1;
774 			q->clg.a1 = ge->p;
775 			q->clg.a2 = ge->r;
776 			q->clg.a3 = ge->h;
777 			q->clg.a4 = ge->k1;
778 			break;
779 		}
780 
781 		default:
782 			pr_info("netem: unknown loss type %u\n", type);
783 			return -EINVAL;
784 		}
785 	}
786 
787 	return 0;
788 }
789 
790 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
791 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
792 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
793 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
794 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
795 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
796 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
797 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
798 };
799 
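/* netem's TCA_OPTIONS starts with a fixed struct tc_netem_qopt, optionally
 * followed by nested attributes; skip over the aligned fixed header before
 * parsing the remainder with the usual netlink policy.
 */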
800 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
801 		      const struct nla_policy *policy, int len)
802 {
803 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
804 
805 	if (nested_len < 0) {
806 		pr_info("netem: invalid attributes len %d\n", nested_len);
807 		return -EINVAL;
808 	}
809 
810 	if (nested_len >= nla_attr_size(0))
811 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
812 				 nested_len, policy);
813 
814 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
815 	return 0;
816 }
817 
818 /* Parse netlink message to set options */
819 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
820 {
821 	struct netem_sched_data *q = qdisc_priv(sch);
822 	struct nlattr *tb[TCA_NETEM_MAX + 1];
823 	struct tc_netem_qopt *qopt;
824 	int ret;
825 
826 	if (opt == NULL)
827 		return -EINVAL;
828 
829 	qopt = nla_data(opt);
830 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
831 	if (ret < 0)
832 		return ret;
833 
834 	sch->limit = qopt->limit;
835 
836 	q->latency = qopt->latency;
837 	q->jitter = qopt->jitter;
838 	q->limit = qopt->limit;
839 	q->gap = qopt->gap;
840 	q->counter = 0;
841 	q->loss = qopt->loss;
842 	q->duplicate = qopt->duplicate;
843 
844 	/* For compatibility with earlier versions:
845 	 * if gap is set, assume 100% reorder probability.
846 	 */
847 	if (q->gap)
848 		q->reorder = ~0;
849 
850 	if (tb[TCA_NETEM_CORR])
851 		get_correlation(sch, tb[TCA_NETEM_CORR]);
852 
853 	if (tb[TCA_NETEM_DELAY_DIST]) {
854 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
855 		if (ret)
856 			return ret;
857 	}
858 
859 	if (tb[TCA_NETEM_REORDER])
860 		get_reorder(sch, tb[TCA_NETEM_REORDER]);
861 
862 	if (tb[TCA_NETEM_CORRUPT])
863 		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
864 
865 	if (tb[TCA_NETEM_RATE])
866 		get_rate(sch, tb[TCA_NETEM_RATE]);
867 
868 	if (tb[TCA_NETEM_RATE64])
869 		q->rate = max_t(u64, q->rate,
870 				nla_get_u64(tb[TCA_NETEM_RATE64]));
871 
872 	if (tb[TCA_NETEM_ECN])
873 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
874 
875 	q->loss_model = CLG_RANDOM;
876 	if (tb[TCA_NETEM_LOSS])
877 		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
878 
879 	return ret;
880 }
881 
882 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
883 {
884 	struct netem_sched_data *q = qdisc_priv(sch);
885 	int ret;
886 
887 	if (!opt)
888 		return -EINVAL;
889 
890 	qdisc_watchdog_init(&q->watchdog, sch);
891 
892 	q->loss_model = CLG_RANDOM;
893 	ret = netem_change(sch, opt);
894 	if (ret)
895 		pr_info("netem: change failed\n");
896 	return ret;
897 }
898 
899 static void netem_destroy(struct Qdisc *sch)
900 {
901 	struct netem_sched_data *q = qdisc_priv(sch);
902 
903 	qdisc_watchdog_cancel(&q->watchdog);
904 	if (q->qdisc)
905 		qdisc_destroy(q->qdisc);
906 	dist_free(q->delay_dist);
907 }
908 
909 static int dump_loss_model(const struct netem_sched_data *q,
910 			   struct sk_buff *skb)
911 {
912 	struct nlattr *nest;
913 
914 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
915 	if (nest == NULL)
916 		goto nla_put_failure;
917 
918 	switch (q->loss_model) {
919 	case CLG_RANDOM:
920 		/* legacy loss model */
921 		nla_nest_cancel(skb, nest);
922 		return 0;	/* no data */
923 
924 	case CLG_4_STATES: {
925 		struct tc_netem_gimodel gi = {
926 			.p13 = q->clg.a1,
927 			.p31 = q->clg.a2,
928 			.p32 = q->clg.a3,
929 			.p14 = q->clg.a4,
930 			.p23 = q->clg.a5,
931 		};
932 
933 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
934 			goto nla_put_failure;
935 		break;
936 	}
937 	case CLG_GILB_ELL: {
938 		struct tc_netem_gemodel ge = {
939 			.p = q->clg.a1,
940 			.r = q->clg.a2,
941 			.h = q->clg.a3,
942 			.k1 = q->clg.a4,
943 		};
944 
945 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
946 			goto nla_put_failure;
947 		break;
948 	}
949 	}
950 
951 	nla_nest_end(skb, nest);
952 	return 0;
953 
954 nla_put_failure:
955 	nla_nest_cancel(skb, nest);
956 	return -1;
957 }
958 
959 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
960 {
961 	const struct netem_sched_data *q = qdisc_priv(sch);
962 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
963 	struct tc_netem_qopt qopt;
964 	struct tc_netem_corr cor;
965 	struct tc_netem_reorder reorder;
966 	struct tc_netem_corrupt corrupt;
967 	struct tc_netem_rate rate;
968 
969 	qopt.latency = q->latency;
970 	qopt.jitter = q->jitter;
971 	qopt.limit = q->limit;
972 	qopt.loss = q->loss;
973 	qopt.gap = q->gap;
974 	qopt.duplicate = q->duplicate;
975 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
976 		goto nla_put_failure;
977 
978 	cor.delay_corr = q->delay_cor.rho;
979 	cor.loss_corr = q->loss_cor.rho;
980 	cor.dup_corr = q->dup_cor.rho;
981 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
982 		goto nla_put_failure;
983 
984 	reorder.probability = q->reorder;
985 	reorder.correlation = q->reorder_cor.rho;
986 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
987 		goto nla_put_failure;
988 
989 	corrupt.probability = q->corrupt;
990 	corrupt.correlation = q->corrupt_cor.rho;
991 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
992 		goto nla_put_failure;
993 
994 	if (q->rate >= (1ULL << 32)) {
995 		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
996 			goto nla_put_failure;
997 		rate.rate = ~0U;
998 	} else {
999 		rate.rate = q->rate;
1000 	}
1001 	rate.packet_overhead = q->packet_overhead;
1002 	rate.cell_size = q->cell_size;
1003 	rate.cell_overhead = q->cell_overhead;
1004 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1005 		goto nla_put_failure;
1006 
1007 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1008 		goto nla_put_failure;
1009 
1010 	if (dump_loss_model(q, skb) != 0)
1011 		goto nla_put_failure;
1012 
1013 	return nla_nest_end(skb, nla);
1014 
1015 nla_put_failure:
1016 	nlmsg_trim(skb, nla);
1017 	return -1;
1018 }
1019 
1020 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1021 			  struct sk_buff *skb, struct tcmsg *tcm)
1022 {
1023 	struct netem_sched_data *q = qdisc_priv(sch);
1024 
1025 	if (cl != 1 || !q->qdisc) 	/* only one class */
1026 		return -ENOENT;
1027 
1028 	tcm->tcm_handle |= TC_H_MIN(1);
1029 	tcm->tcm_info = q->qdisc->handle;
1030 
1031 	return 0;
1032 }
1033 
1034 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1035 		     struct Qdisc **old)
1036 {
1037 	struct netem_sched_data *q = qdisc_priv(sch);
1038 
1039 	sch_tree_lock(sch);
1040 	*old = q->qdisc;
1041 	q->qdisc = new;
1042 	if (*old) {
1043 		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1044 		qdisc_reset(*old);
1045 	}
1046 	sch_tree_unlock(sch);
1047 
1048 	return 0;
1049 }
1050 
1051 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1052 {
1053 	struct netem_sched_data *q = qdisc_priv(sch);
1054 	return q->qdisc;
1055 }
1056 
1057 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
1058 {
1059 	return 1;
1060 }
1061 
1062 static void netem_put(struct Qdisc *sch, unsigned long arg)
1063 {
1064 }
1065 
1066 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1067 {
1068 	if (!walker->stop) {
1069 		if (walker->count >= walker->skip)
1070 			if (walker->fn(sch, 1, walker) < 0) {
1071 				walker->stop = 1;
1072 				return;
1073 			}
1074 		walker->count++;
1075 	}
1076 }
1077 
1078 static const struct Qdisc_class_ops netem_class_ops = {
1079 	.graft		=	netem_graft,
1080 	.leaf		=	netem_leaf,
1081 	.get		=	netem_get,
1082 	.put		=	netem_put,
1083 	.walk		=	netem_walk,
1084 	.dump		=	netem_dump_class,
1085 };
1086 
1087 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1088 	.id		=	"netem",
1089 	.cl_ops		=	&netem_class_ops,
1090 	.priv_size	=	sizeof(struct netem_sched_data),
1091 	.enqueue	=	netem_enqueue,
1092 	.dequeue	=	netem_dequeue,
1093 	.peek		=	qdisc_peek_dequeued,
1094 	.drop		=	netem_drop,
1095 	.init		=	netem_init,
1096 	.reset		=	netem_reset,
1097 	.destroy	=	netem_destroy,
1098 	.change		=	netem_change,
1099 	.dump		=	netem_dump,
1100 	.owner		=	THIS_MODULE,
1101 };
1102 
1103 
1104 static int __init netem_module_init(void)
1105 {
1106 	pr_info("netem: version " VERSION "\n");
1107 	return register_qdisc(&netem_qdisc_ops);
1108 }
1109 static void __exit netem_module_exit(void)
1110 {
1111 	unregister_qdisc(&netem_qdisc_ops);
1112 }
1113 module_init(netem_module_init)
1114 module_exit(netem_module_exit)
1115 MODULE_LICENSE("GPL");
1116