xref: /openbmc/linux/net/sched/sch_netem.c (revision 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool"
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification; that can be handled by
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either, since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to the 4-state Markov
59 	chain (GI model) and the 2-state "Gilbert-Elliot" model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
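/* Example configuration (illustrative; "eth0" is a placeholder, the
 * authoritative option syntax is documented in tc-netem(8)):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *		100ms delay with 10ms jitter, 25% correlation between samples
 *
 *	tc qdisc change dev eth0 root netem delay 100ms 10ms 25% loss 1% 25% reorder 25% 50%
 *		adds correlated random loss and reordering on top of the delay
 *
 * The correlated loss generators below are selected with "loss state ..."
 * (4-state GI model) or "loss gemodel ..." (Gilbert-Elliot); their
 * parameters end up in clgstate.a1..a5, see struct netem_sched_data.
 */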
70 
71 struct netem_sched_data {
72 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
73 	struct rb_root t_root;
74 
75 	/* optional qdisc for classful handling (NULL at netem init) */
76 	struct Qdisc	*qdisc;
77 
78 	struct qdisc_watchdog watchdog;
79 
80 	psched_tdiff_t latency;
81 	psched_tdiff_t jitter;
82 
83 	u32 loss;
84 	u32 ecn;
85 	u32 limit;
86 	u32 counter;
87 	u32 gap;
88 	u32 duplicate;
89 	u32 reorder;
90 	u32 corrupt;
91 	u64 rate;
92 	s32 packet_overhead;
93 	u32 cell_size;
94 	struct reciprocal_value cell_size_reciprocal;
95 	s32 cell_overhead;
96 
97 	struct crndstate {
98 		u32 last;
99 		u32 rho;
100 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
101 
102 	struct disttable {
103 		u32  size;
104 		s16 table[0];
105 	} *delay_dist;
106 
107 	enum  {
108 		CLG_RANDOM,
109 		CLG_4_STATES,
110 		CLG_GILB_ELL,
111 	} loss_model;
112 
113 	enum {
114 		TX_IN_GAP_PERIOD = 1,
115 		TX_IN_BURST_PERIOD,
116 		LOST_IN_GAP_PERIOD,
117 		LOST_IN_BURST_PERIOD,
118 	} _4_state_model;
119 
120 	enum {
121 		GOOD_STATE = 1,
122 		BAD_STATE,
123 	} GE_state_model;
124 
125 	/* Correlated Loss Generation models */
126 	struct clgstate {
127 		/* state of the Markov chain */
128 		u8 state;
129 
130 		/* 4-states and Gilbert-Elliot models */
131 		u32 a1;	/* p13 for 4-states or p for GE */
132 		u32 a2;	/* p31 for 4-states or r for GE */
133 		u32 a3;	/* p32 for 4-states or h for GE */
134 		u32 a4;	/* p14 for 4-states or 1-k for GE */
135 		u32 a5; /* p23 used only in 4-states */
136 	} clg;
137 
138 };
139 
140 /* Time stamp put into socket buffer control block
141  * Only valid when skbs are in our internal t(ime)fifo queue.
142  *
143  * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
144  * and skb->next & skb->prev are scratch space for a qdisc,
145  * we save skb->tstamp value in skb->cb[] before destroying it.
146  */
147 struct netem_skb_cb {
148 	psched_time_t	time_to_send;
149 };
150 
151 
152 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
153 {
154 	return rb_entry(rb, struct sk_buff, rbnode);
155 }
156 
157 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
158 {
159 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
160 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
161 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
162 }
163 
164 /* init_crandom - initialize correlated random number generator
165  * Use entropy source for initial seed.
166  */
167 static void init_crandom(struct crndstate *state, unsigned long rho)
168 {
169 	state->rho = rho;
170 	state->last = prandom_u32();
171 }
172 
173 /* get_crandom - correlated random number generator
174  * Next number depends on last value.
175  * rho is scaled to avoid floating point.
176  */
177 static u32 get_crandom(struct crndstate *state)
178 {
179 	u64 value, rho;
180 	unsigned long answer;
181 
182 	if (state->rho == 0)	/* no correlation */
183 		return prandom_u32();
184 
185 	value = prandom_u32();
186 	rho = (u64)state->rho + 1;
187 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
188 	state->last = answer;
189 	return answer;
190 }
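/* For illustration, the fixed-point blend above is roughly
 *
 *	answer = value * (1 - rho/2^32) + last * rho/2^32
 *
 * so rho == 0 yields an uncorrelated stream, while rho close to 2^32 makes
 * each value track the previous one.  E.g. with rho ~ 2^31 (about 50%
 * correlation) the result is approximately the average of the fresh random
 * value and the previous output.
 */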
191 
192 /* loss_4state - 4-state model loss generator
193  * Generates losses according to the 4-state Markov chain adopted in
194  * the GI (General and Intuitive) loss model.
195  */
196 static bool loss_4state(struct netem_sched_data *q)
197 {
198 	struct clgstate *clg = &q->clg;
199 	u32 rnd = prandom_u32();
200 
201 	/*
202 	 * Compare rnd with the transition probabilities outgoing from
203 	 * the current state, then decide the next state and whether the
204 	 * next packet has to be transmitted or lost.
205 	 * The four states correspond to:
206 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
207 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
208 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
209 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
210 	 */
211 	switch (clg->state) {
212 	case TX_IN_GAP_PERIOD:
213 		if (rnd < clg->a4) {
214 			clg->state = LOST_IN_BURST_PERIOD;
215 			return true;
216 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
217 			clg->state = LOST_IN_GAP_PERIOD;
218 			return true;
219 		} else if (clg->a1 + clg->a4 < rnd) {
220 			clg->state = TX_IN_GAP_PERIOD;
221 		}
222 
223 		break;
224 	case TX_IN_BURST_PERIOD:
225 		if (rnd < clg->a5) {
226 			clg->state = LOST_IN_GAP_PERIOD;
227 			return true;
228 		} else {
229 			clg->state = TX_IN_BURST_PERIOD;
230 		}
231 
232 		break;
233 	case LOST_IN_GAP_PERIOD:
234 		if (rnd < clg->a3)
235 			clg->state = TX_IN_BURST_PERIOD;
236 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
237 			clg->state = TX_IN_GAP_PERIOD;
238 		} else if (clg->a2 + clg->a3 < rnd) {
239 			clg->state = LOST_IN_GAP_PERIOD;
240 			return true;
241 		}
242 		break;
243 	case LOST_IN_BURST_PERIOD:
244 		clg->state = TX_IN_GAP_PERIOD;
245 		break;
246 	}
247 
248 	return false;
249 }
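/* Threshold layout used above, with a1..a5 holding p13/p31/p32/p14/p23
 * scaled to the full 32-bit range (a rnd exactly equal to a threshold falls
 * through all branches, leaving the state unchanged and the packet sent):
 *
 *	TX_IN_GAP_PERIOD:	[0, a4)       -> LOST_IN_BURST_PERIOD, lost
 *				[a4, a4 + a1) -> LOST_IN_GAP_PERIOD, lost
 *				otherwise     -> stay, transmitted
 *	TX_IN_BURST_PERIOD:	[0, a5)       -> LOST_IN_GAP_PERIOD, lost
 *				otherwise     -> stay, transmitted
 *	LOST_IN_GAP_PERIOD:	[0, a3)       -> TX_IN_BURST_PERIOD, transmitted
 *				[a3, a3 + a2) -> TX_IN_GAP_PERIOD, transmitted
 *				otherwise     -> stay, lost
 *	LOST_IN_BURST_PERIOD:	always back to TX_IN_GAP_PERIOD, transmitted
 */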
250 
251 /* loss_gilb_ell - Gilbert-Elliot model loss generator
252  * Generates losses according to the Gilbert-Elliot loss model or
253  * its special cases  (Gilbert or Simple Gilbert)
254  *
255  * Makes a comparison between a random number and the transition
256  * probabilities outgoing from the current state, then decides the
257  * next state. A second random number is extracted and the comparison
258  * with the loss probability of the current state decides if the next
259  * packet will be transmitted or lost.
260  */
261 static bool loss_gilb_ell(struct netem_sched_data *q)
262 {
263 	struct clgstate *clg = &q->clg;
264 
265 	switch (clg->state) {
266 	case GOOD_STATE:
267 		if (prandom_u32() < clg->a1)
268 			clg->state = BAD_STATE;
269 		if (prandom_u32() < clg->a4)
270 			return true;
271 		break;
272 	case BAD_STATE:
273 		if (prandom_u32() < clg->a2)
274 			clg->state = GOOD_STATE;
275 		if (prandom_u32() > clg->a3)
276 			return true;
277 	}
278 
279 	return false;
280 }
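/* In terms of the Gilbert-Elliot parameters stored in clgstate (a1 = p,
 * a2 = r, a3 = h, a4 = 1-k): in GOOD_STATE the packet is lost with
 * probability 1-k and the chain moves to BAD_STATE with probability p;
 * in BAD_STATE the packet is lost with probability 1-h (hence the "> a3"
 * comparison) and the chain recovers with probability r.  Two independent
 * random draws are used per packet, one for the state transition and one
 * for the loss decision.
 */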
281 
282 static bool loss_event(struct netem_sched_data *q)
283 {
284 	switch (q->loss_model) {
285 	case CLG_RANDOM:
286 		/* Random packet drop 0 => none, ~0 => all */
287 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
288 
289 	case CLG_4_STATES:
290 		/* 4-state loss model algorithm (used also for the GI model).
291 		 * Extracts a loss decision from the 4-state Markov loss
292 		 * generator; the caller drops the packet when a loss event
293 		 * is returned.
294 		 */
295 		return loss_4state(q);
296 
297 	case CLG_GILB_ELL:
298 		/* Gilbert-Elliot loss model algorithm.
299 		 * Extracts a loss decision from the Gilbert-Elliot loss
300 		 * generator; the caller drops the packet when a loss event
301 		 * is returned.
302 		 */
303 		return loss_gilb_ell(q);
304 	}
305 
306 	return false;	/* not reached */
307 }
308 
309 
310 /* tabledist - return a pseudo-randomly distributed value with mean mu and
311  * std deviation sigma.  Uses table lookup to approximate the desired
312  * distribution, and a uniformly-distributed pseudo-random source.
313  */
314 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
315 				struct crndstate *state,
316 				const struct disttable *dist)
317 {
318 	psched_tdiff_t x;
319 	long t;
320 	u32 rnd;
321 
322 	if (sigma == 0)
323 		return mu;
324 
325 	rnd = get_crandom(state);
326 
327 	/* default uniform distribution */
328 	if (dist == NULL)
329 		return (rnd % (2*sigma)) - sigma + mu;
330 
331 	t = dist->table[rnd % dist->size];
332 	x = (sigma % NETEM_DIST_SCALE) * t;
333 	if (x >= 0)
334 		x += NETEM_DIST_SCALE/2;
335 	else
336 		x -= NETEM_DIST_SCALE/2;
337 
338 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
339 }
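/* Worked example (numbers are illustrative): without a table and sigma = 10,
 * the result is uniform over [mu - 10, mu + 9].  With a table, an entry t
 * contributes roughly t * sigma / NETEM_DIST_SCALE, i.e. table values are
 * expressed in units of sigma/NETEM_DIST_SCALE; the split of sigma into
 * quotient and remainder with respect to NETEM_DIST_SCALE is just an
 * integer-arithmetic way of computing that product with rounding.
 */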
340 
341 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
342 {
343 	u64 ticks;
344 
345 	len += q->packet_overhead;
346 
347 	if (q->cell_size) {
348 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
349 
350 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
351 			cells++;
352 		len = cells * (q->cell_size + q->cell_overhead);
353 	}
354 
355 	ticks = (u64)len * NSEC_PER_SEC;
356 
357 	do_div(ticks, q->rate);
358 	return PSCHED_NS2TICKS(ticks);
359 }
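/* Worked example (numbers are illustrative): with rate = 125000 bytes/s
 * (1 Mbit/s), packet_overhead = 0 and no cell accounting, a 1500 byte packet
 * costs 1500 * NSEC_PER_SEC / 125000 = 12,000,000 ns = 12 ms of transmission
 * time.  With ATM-like framing (cell_size = 48, cell_overhead = 5), the same
 * packet is rounded up to 32 cells and charged as 32 * 53 = 1696 bytes.
 */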
360 
361 static void tfifo_reset(struct Qdisc *sch)
362 {
363 	struct netem_sched_data *q = qdisc_priv(sch);
364 	struct rb_node *p = rb_first(&q->t_root);
365 
366 	while (p) {
367 		struct sk_buff *skb = netem_rb_to_skb(p);
368 
369 		p = rb_next(p);
370 		rb_erase(&skb->rbnode, &q->t_root);
371 		rtnl_kfree_skbs(skb, skb);
372 	}
373 }
374 
375 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
376 {
377 	struct netem_sched_data *q = qdisc_priv(sch);
378 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
379 	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
380 
381 	while (*p) {
382 		struct sk_buff *skb;
383 
384 		parent = *p;
385 		skb = netem_rb_to_skb(parent);
386 		if (tnext >= netem_skb_cb(skb)->time_to_send)
387 			p = &parent->rb_right;
388 		else
389 			p = &parent->rb_left;
390 	}
391 	rb_link_node(&nskb->rbnode, parent, p);
392 	rb_insert_color(&nskb->rbnode, &q->t_root);
393 	sch->q.qlen++;
394 }
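/* Note that the ">=" comparison above sends an skb with an equal
 * time_to_send to the right of the existing node, so packets that become
 * due at the same time are dequeued (via rb_first) in arrival order.
 */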
395 
396 /* netem can't properly corrupt a megapacket (like we get from GSO), so when
397  * we statistically choose to corrupt one, we segment it, returning the first
398  * packet to be corrupted, and re-enqueue the remaining frames.
399  */
400 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
401 				     struct sk_buff **to_free)
402 {
403 	struct sk_buff *segs;
404 	netdev_features_t features = netif_skb_features(skb);
405 
406 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
407 
408 	if (IS_ERR_OR_NULL(segs)) {
409 		qdisc_drop(skb, sch, to_free);
410 		return NULL;
411 	}
412 	consume_skb(skb);
413 	return segs;
414 }
415 
416 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
417 {
418 	skb->next = qh->head;
419 
420 	if (!qh->head)
421 		qh->tail = skb;
422 	qh->head = skb;
423 	qh->qlen++;
424 }
425 
426 /*
427  * Insert one skb into qdisc.
428  * Note: parent depends on return value to account for queue length.
429  *	NET_XMIT_DROP: queue length didn't change.
430  *	NET_XMIT_SUCCESS: one skb was queued.
431  */
432 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
433 			 struct sk_buff **to_free)
434 {
435 	struct netem_sched_data *q = qdisc_priv(sch);
436 	/* We don't fill cb now as skb_unshare() may invalidate it */
437 	struct netem_skb_cb *cb;
438 	struct sk_buff *skb2;
439 	struct sk_buff *segs = NULL;
440 	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
441 	int nb = 0;
442 	int count = 1;
443 	int rc = NET_XMIT_SUCCESS;
444 
445 	/* Random duplication */
446 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
447 		++count;
448 
449 	/* Drop packet? */
450 	if (loss_event(q)) {
451 		if (q->ecn && INET_ECN_set_ce(skb))
452 			qdisc_qstats_drop(sch); /* mark packet */
453 		else
454 			--count;
455 	}
456 	if (count == 0) {
457 		qdisc_qstats_drop(sch);
458 		__qdisc_drop(skb, to_free);
459 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
460 	}
461 
462 	/* If a delay is expected, orphan the skb. (orphaning usually takes
463 	 * place at TX completion time, so _before_ the link transit delay)
464 	 */
465 	if (q->latency || q->jitter || q->rate)
466 		skb_orphan_partial(skb);
467 
468 	/*
469 	 * If we need to duplicate the packet, then re-insert the copy at
470 	 * the top of the qdisc tree, since the parent queuer expects that
471 	 * only one skb will be queued.
472 	 */
473 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
474 		struct Qdisc *rootq = qdisc_root(sch);
475 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
476 
477 		q->duplicate = 0;
478 		rootq->enqueue(skb2, rootq, to_free);
479 		q->duplicate = dupsave;
480 	}
481 
482 	/*
483 	 * Randomized packet corruption.
484 	 * Make a copy if needed since we are modifying the data.
485 	 * If the packet is going to be hardware checksummed, then
486 	 * do the checksum now in software before we mangle it.
487 	 */
488 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
489 		if (skb_is_gso(skb)) {
490 			segs = netem_segment(skb, sch, to_free);
491 			if (!segs)
492 				return NET_XMIT_DROP;
493 		} else {
494 			segs = skb;
495 		}
496 
497 		skb = segs;
498 		segs = segs->next;
499 
500 		skb = skb_unshare(skb, GFP_ATOMIC);
501 		if (unlikely(!skb)) {
502 			qdisc_qstats_drop(sch);
503 			goto finish_segs;
504 		}
505 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
506 		    skb_checksum_help(skb)) {
507 			qdisc_drop(skb, sch, to_free);
508 			goto finish_segs;
509 		}
510 
511 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
512 			1<<(prandom_u32() % 8);
513 	}
514 
515 	if (unlikely(sch->q.qlen >= sch->limit))
516 		return qdisc_drop(skb, sch, to_free);
517 
518 	qdisc_qstats_backlog_inc(sch, skb);
519 
520 	cb = netem_skb_cb(skb);
521 	if (q->gap == 0 ||		/* not doing reordering */
522 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
523 	    q->reorder < get_crandom(&q->reorder_cor)) {
524 		psched_time_t now;
525 		psched_tdiff_t delay;
526 
527 		delay = tabledist(q->latency, q->jitter,
528 				  &q->delay_cor, q->delay_dist);
529 
530 		now = psched_get_time();
531 
532 		if (q->rate) {
533 			struct netem_skb_cb *last = NULL;
534 
535 			if (sch->q.tail)
536 				last = netem_skb_cb(sch->q.tail);
537 			if (q->t_root.rb_node) {
538 				struct sk_buff *t_skb;
539 				struct netem_skb_cb *t_last;
540 
541 				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
542 				t_last = netem_skb_cb(t_skb);
543 				if (!last ||
544 				    t_last->time_to_send > last->time_to_send) {
545 					last = t_last;
546 				}
547 			}
548 
549 			if (last) {
550 				/*
551 				 * The last packet in the queue is the reference
552 				 * point (now); the time this packet will spend
553 				 * waiting behind it is subtracted from the delay.
554 				 */
555 				delay -= last->time_to_send - now;
556 				delay = max_t(psched_tdiff_t, 0, delay);
557 				now = last->time_to_send;
558 			}
559 
560 			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
561 		}
562 
563 		cb->time_to_send = now + delay;
564 		++q->counter;
565 		tfifo_enqueue(skb, sch);
566 	} else {
567 		/*
568 		 * Do re-ordering by putting one out of N packets at the front
569 		 * of the queue.
570 		 */
571 		cb->time_to_send = psched_get_time();
572 		q->counter = 0;
573 
574 		netem_enqueue_skb_head(&sch->q, skb);
575 		sch->qstats.requeues++;
576 	}
577 
578 finish_segs:
579 	if (segs) {
580 		while (segs) {
581 			skb2 = segs->next;
582 			segs->next = NULL;
583 			qdisc_skb_cb(segs)->pkt_len = segs->len;
584 			last_len = segs->len;
585 			rc = qdisc_enqueue(segs, sch, to_free);
586 			if (rc != NET_XMIT_SUCCESS) {
587 				if (net_xmit_drop_count(rc))
588 					qdisc_qstats_drop(sch);
589 			} else {
590 				nb++;
591 				len += last_len;
592 			}
593 			segs = skb2;
594 		}
595 		sch->q.qlen += nb;
596 		if (nb > 1)
597 			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
598 	}
599 	return NET_XMIT_SUCCESS;
600 }
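/* A note on the finish_segs accounting above: the parent qdiscs accounted
 * for a single GSO skb of prev_len bytes when they passed it down, so once
 * the extra segments have been enqueued, qdisc_tree_reduce_backlog() is
 * called with deltas of (1 - nb) packets and (prev_len - len) bytes so that
 * the ancestors' counters reflect the segments actually queued rather than
 * the original GSO skb.
 */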
601 
602 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
603 {
604 	struct netem_sched_data *q = qdisc_priv(sch);
605 	struct sk_buff *skb;
606 	struct rb_node *p;
607 
608 tfifo_dequeue:
609 	skb = __qdisc_dequeue_head(&sch->q);
610 	if (skb) {
611 		qdisc_qstats_backlog_dec(sch, skb);
612 deliver:
613 		qdisc_bstats_update(sch, skb);
614 		return skb;
615 	}
616 	p = rb_first(&q->t_root);
617 	if (p) {
618 		psched_time_t time_to_send;
619 
620 		skb = netem_rb_to_skb(p);
621 
622 		/* is there still time remaining before it should be sent? */
623 		time_to_send = netem_skb_cb(skb)->time_to_send;
624 		if (time_to_send <= psched_get_time()) {
625 			rb_erase(p, &q->t_root);
626 
627 			sch->q.qlen--;
628 			qdisc_qstats_backlog_dec(sch, skb);
629 			skb->next = NULL;
630 			skb->prev = NULL;
631 			/* skb->dev shares skb->rbnode area,
632 			 * we need to restore its value.
633 			 */
634 			skb->dev = qdisc_dev(sch);
635 
636 #ifdef CONFIG_NET_CLS_ACT
637 			/*
638 			 * If it's at ingress let's pretend the delay is
639 			 * from the network (tstamp will be updated).
640 			 */
641 			if (skb->tc_redirected && skb->tc_from_ingress)
642 				skb->tstamp = 0;
643 #endif
644 
645 			if (q->qdisc) {
646 				unsigned int pkt_len = qdisc_pkt_len(skb);
647 				struct sk_buff *to_free = NULL;
648 				int err;
649 
650 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
651 				kfree_skb_list(to_free);
652 				if (err != NET_XMIT_SUCCESS &&
653 				    net_xmit_drop_count(err)) {
654 					qdisc_qstats_drop(sch);
655 					qdisc_tree_reduce_backlog(sch, 1,
656 								  pkt_len);
657 				}
658 				goto tfifo_dequeue;
659 			}
660 			goto deliver;
661 		}
662 
663 		if (q->qdisc) {
664 			skb = q->qdisc->ops->dequeue(q->qdisc);
665 			if (skb)
666 				goto deliver;
667 		}
668 		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
669 	}
670 
671 	if (q->qdisc) {
672 		skb = q->qdisc->ops->dequeue(q->qdisc);
673 		if (skb)
674 			goto deliver;
675 	}
676 	return NULL;
677 }
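/* Dequeue order, in short: packets whose time_to_send has arrived are taken
 * either from the head of sch->q (packets re-queued at the head for
 * reordering) or from the leftmost node of the t_root rb-tree (the earliest
 * scheduled packet).  If the earliest scheduled packet is still in the
 * future, the qdisc watchdog is armed to fire at its time_to_send and NULL
 * is returned, unless an inner qdisc has something ready to send.
 */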
678 
679 static void netem_reset(struct Qdisc *sch)
680 {
681 	struct netem_sched_data *q = qdisc_priv(sch);
682 
683 	qdisc_reset_queue(sch);
684 	tfifo_reset(sch);
685 	if (q->qdisc)
686 		qdisc_reset(q->qdisc);
687 	qdisc_watchdog_cancel(&q->watchdog);
688 }
689 
690 static void dist_free(struct disttable *d)
691 {
692 	kvfree(d);
693 }
694 
695 /*
696  * Distribution data is a variable size payload containing
697  * signed 16 bit values.
698  */
699 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
700 {
701 	struct netem_sched_data *q = qdisc_priv(sch);
702 	size_t n = nla_len(attr)/sizeof(__s16);
703 	const __s16 *data = nla_data(attr);
704 	spinlock_t *root_lock;
705 	struct disttable *d;
706 	int i;
707 
708 	if (n > NETEM_DIST_MAX)
709 		return -EINVAL;
710 
711 	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
712 	if (!d)
713 		return -ENOMEM;
714 
715 	d->size = n;
716 	for (i = 0; i < n; i++)
717 		d->table[i] = data[i];
718 
719 	root_lock = qdisc_root_sleeping_lock(sch);
720 
721 	spin_lock_bh(root_lock);
722 	swap(q->delay_dist, d);
723 	spin_unlock_bh(root_lock);
724 
725 	dist_free(d);
726 	return 0;
727 }
728 
729 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
730 {
731 	const struct tc_netem_corr *c = nla_data(attr);
732 
733 	init_crandom(&q->delay_cor, c->delay_corr);
734 	init_crandom(&q->loss_cor, c->loss_corr);
735 	init_crandom(&q->dup_cor, c->dup_corr);
736 }
737 
738 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
739 {
740 	const struct tc_netem_reorder *r = nla_data(attr);
741 
742 	q->reorder = r->probability;
743 	init_crandom(&q->reorder_cor, r->correlation);
744 }
745 
746 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
747 {
748 	const struct tc_netem_corrupt *r = nla_data(attr);
749 
750 	q->corrupt = r->probability;
751 	init_crandom(&q->corrupt_cor, r->correlation);
752 }
753 
754 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
755 {
756 	const struct tc_netem_rate *r = nla_data(attr);
757 
758 	q->rate = r->rate;
759 	q->packet_overhead = r->packet_overhead;
760 	q->cell_size = r->cell_size;
761 	q->cell_overhead = r->cell_overhead;
762 	if (q->cell_size)
763 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
764 	else
765 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
766 }
767 
768 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
769 {
770 	const struct nlattr *la;
771 	int rem;
772 
773 	nla_for_each_nested(la, attr, rem) {
774 		u16 type = nla_type(la);
775 
776 		switch (type) {
777 		case NETEM_LOSS_GI: {
778 			const struct tc_netem_gimodel *gi = nla_data(la);
779 
780 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
781 				pr_info("netem: incorrect gi model size\n");
782 				return -EINVAL;
783 			}
784 
785 			q->loss_model = CLG_4_STATES;
786 
787 			q->clg.state = TX_IN_GAP_PERIOD;
788 			q->clg.a1 = gi->p13;
789 			q->clg.a2 = gi->p31;
790 			q->clg.a3 = gi->p32;
791 			q->clg.a4 = gi->p14;
792 			q->clg.a5 = gi->p23;
793 			break;
794 		}
795 
796 		case NETEM_LOSS_GE: {
797 			const struct tc_netem_gemodel *ge = nla_data(la);
798 
799 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
800 				pr_info("netem: incorrect ge model size\n");
801 				return -EINVAL;
802 			}
803 
804 			q->loss_model = CLG_GILB_ELL;
805 			q->clg.state = GOOD_STATE;
806 			q->clg.a1 = ge->p;
807 			q->clg.a2 = ge->r;
808 			q->clg.a3 = ge->h;
809 			q->clg.a4 = ge->k1;
810 			break;
811 		}
812 
813 		default:
814 			pr_info("netem: unknown loss type %u\n", type);
815 			return -EINVAL;
816 		}
817 	}
818 
819 	return 0;
820 }
821 
822 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
823 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
824 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
825 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
826 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
827 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
828 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
829 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
830 };
831 
832 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
833 		      const struct nla_policy *policy, int len)
834 {
835 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
836 
837 	if (nested_len < 0) {
838 		pr_info("netem: invalid attributes len %d\n", nested_len);
839 		return -EINVAL;
840 	}
841 
842 	if (nested_len >= nla_attr_size(0))
843 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
844 				 nested_len, policy, NULL);
845 
846 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
847 	return 0;
848 }
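/* Layout handled here: netem's TCA_OPTIONS carries a fixed struct
 * tc_netem_qopt first, optionally followed by nested attributes such as
 * TCA_NETEM_CORR or TCA_NETEM_LOSS.  parse_attr() therefore skips
 * NLA_ALIGN(sizeof(struct tc_netem_qopt)) bytes before running nla_parse()
 * over whatever remains, and returns an empty attribute table when only the
 * fixed part is present.
 */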
849 
850 /* Parse netlink message to set options */
851 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
852 {
853 	struct netem_sched_data *q = qdisc_priv(sch);
854 	struct nlattr *tb[TCA_NETEM_MAX + 1];
855 	struct tc_netem_qopt *qopt;
856 	struct clgstate old_clg;
857 	int old_loss_model = CLG_RANDOM;
858 	int ret;
859 
860 	if (opt == NULL)
861 		return -EINVAL;
862 
863 	qopt = nla_data(opt);
864 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
865 	if (ret < 0)
866 		return ret;
867 
868 	/* backup q->clg and q->loss_model */
869 	old_clg = q->clg;
870 	old_loss_model = q->loss_model;
871 
872 	if (tb[TCA_NETEM_LOSS]) {
873 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
874 		if (ret) {
875 			q->loss_model = old_loss_model;
876 			return ret;
877 		}
878 	} else {
879 		q->loss_model = CLG_RANDOM;
880 	}
881 
882 	if (tb[TCA_NETEM_DELAY_DIST]) {
883 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
884 		if (ret) {
885 			/* recover clg and loss_model, in case
886 			 * q->clg and q->loss_model were modified
887 			 * by get_loss_clg()
888 			 */
889 			q->clg = old_clg;
890 			q->loss_model = old_loss_model;
891 			return ret;
892 		}
893 	}
894 
895 	sch->limit = qopt->limit;
896 
897 	q->latency = qopt->latency;
898 	q->jitter = qopt->jitter;
899 	q->limit = qopt->limit;
900 	q->gap = qopt->gap;
901 	q->counter = 0;
902 	q->loss = qopt->loss;
903 	q->duplicate = qopt->duplicate;
904 
905 	/* for compatibility with earlier versions.
906 	 * if gap is set, need to assume 100% probability
907 	 */
908 	if (q->gap)
909 		q->reorder = ~0;
910 
911 	if (tb[TCA_NETEM_CORR])
912 		get_correlation(q, tb[TCA_NETEM_CORR]);
913 
914 	if (tb[TCA_NETEM_REORDER])
915 		get_reorder(q, tb[TCA_NETEM_REORDER]);
916 
917 	if (tb[TCA_NETEM_CORRUPT])
918 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
919 
920 	if (tb[TCA_NETEM_RATE])
921 		get_rate(q, tb[TCA_NETEM_RATE]);
922 
923 	if (tb[TCA_NETEM_RATE64])
924 		q->rate = max_t(u64, q->rate,
925 				nla_get_u64(tb[TCA_NETEM_RATE64]));
926 
927 	if (tb[TCA_NETEM_ECN])
928 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
929 
930 	return ret;
931 }
932 
933 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
934 {
935 	struct netem_sched_data *q = qdisc_priv(sch);
936 	int ret;
937 
938 	qdisc_watchdog_init(&q->watchdog, sch);
939 
940 	if (!opt)
941 		return -EINVAL;
942 
943 	q->loss_model = CLG_RANDOM;
944 	ret = netem_change(sch, opt);
945 	if (ret)
946 		pr_info("netem: change failed\n");
947 	return ret;
948 }
949 
950 static void netem_destroy(struct Qdisc *sch)
951 {
952 	struct netem_sched_data *q = qdisc_priv(sch);
953 
954 	qdisc_watchdog_cancel(&q->watchdog);
955 	if (q->qdisc)
956 		qdisc_destroy(q->qdisc);
957 	dist_free(q->delay_dist);
958 }
959 
960 static int dump_loss_model(const struct netem_sched_data *q,
961 			   struct sk_buff *skb)
962 {
963 	struct nlattr *nest;
964 
965 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
966 	if (nest == NULL)
967 		goto nla_put_failure;
968 
969 	switch (q->loss_model) {
970 	case CLG_RANDOM:
971 		/* legacy loss model */
972 		nla_nest_cancel(skb, nest);
973 		return 0;	/* no data */
974 
975 	case CLG_4_STATES: {
976 		struct tc_netem_gimodel gi = {
977 			.p13 = q->clg.a1,
978 			.p31 = q->clg.a2,
979 			.p32 = q->clg.a3,
980 			.p14 = q->clg.a4,
981 			.p23 = q->clg.a5,
982 		};
983 
984 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
985 			goto nla_put_failure;
986 		break;
987 	}
988 	case CLG_GILB_ELL: {
989 		struct tc_netem_gemodel ge = {
990 			.p = q->clg.a1,
991 			.r = q->clg.a2,
992 			.h = q->clg.a3,
993 			.k1 = q->clg.a4,
994 		};
995 
996 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
997 			goto nla_put_failure;
998 		break;
999 	}
1000 	}
1001 
1002 	nla_nest_end(skb, nest);
1003 	return 0;
1004 
1005 nla_put_failure:
1006 	nla_nest_cancel(skb, nest);
1007 	return -1;
1008 }
1009 
1010 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1011 {
1012 	const struct netem_sched_data *q = qdisc_priv(sch);
1013 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1014 	struct tc_netem_qopt qopt;
1015 	struct tc_netem_corr cor;
1016 	struct tc_netem_reorder reorder;
1017 	struct tc_netem_corrupt corrupt;
1018 	struct tc_netem_rate rate;
1019 
1020 	qopt.latency = q->latency;
1021 	qopt.jitter = q->jitter;
1022 	qopt.limit = q->limit;
1023 	qopt.loss = q->loss;
1024 	qopt.gap = q->gap;
1025 	qopt.duplicate = q->duplicate;
1026 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1027 		goto nla_put_failure;
1028 
1029 	cor.delay_corr = q->delay_cor.rho;
1030 	cor.loss_corr = q->loss_cor.rho;
1031 	cor.dup_corr = q->dup_cor.rho;
1032 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1033 		goto nla_put_failure;
1034 
1035 	reorder.probability = q->reorder;
1036 	reorder.correlation = q->reorder_cor.rho;
1037 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1038 		goto nla_put_failure;
1039 
1040 	corrupt.probability = q->corrupt;
1041 	corrupt.correlation = q->corrupt_cor.rho;
1042 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1043 		goto nla_put_failure;
1044 
1045 	if (q->rate >= (1ULL << 32)) {
1046 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1047 				      TCA_NETEM_PAD))
1048 			goto nla_put_failure;
1049 		rate.rate = ~0U;
1050 	} else {
1051 		rate.rate = q->rate;
1052 	}
1053 	rate.packet_overhead = q->packet_overhead;
1054 	rate.cell_size = q->cell_size;
1055 	rate.cell_overhead = q->cell_overhead;
1056 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1057 		goto nla_put_failure;
1058 
1059 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1060 		goto nla_put_failure;
1061 
1062 	if (dump_loss_model(q, skb) != 0)
1063 		goto nla_put_failure;
1064 
1065 	return nla_nest_end(skb, nla);
1066 
1067 nla_put_failure:
1068 	nlmsg_trim(skb, nla);
1069 	return -1;
1070 }
1071 
1072 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1073 			  struct sk_buff *skb, struct tcmsg *tcm)
1074 {
1075 	struct netem_sched_data *q = qdisc_priv(sch);
1076 
1077 	if (cl != 1 || !q->qdisc) 	/* only one class */
1078 		return -ENOENT;
1079 
1080 	tcm->tcm_handle |= TC_H_MIN(1);
1081 	tcm->tcm_info = q->qdisc->handle;
1082 
1083 	return 0;
1084 }
1085 
1086 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1087 		     struct Qdisc **old)
1088 {
1089 	struct netem_sched_data *q = qdisc_priv(sch);
1090 
1091 	*old = qdisc_replace(sch, new, &q->qdisc);
1092 	return 0;
1093 }
1094 
1095 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1096 {
1097 	struct netem_sched_data *q = qdisc_priv(sch);
1098 	return q->qdisc;
1099 }
1100 
1101 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1102 {
1103 	return 1;
1104 }
1105 
1106 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1107 {
1108 	if (!walker->stop) {
1109 		if (walker->count >= walker->skip)
1110 			if (walker->fn(sch, 1, walker) < 0) {
1111 				walker->stop = 1;
1112 				return;
1113 			}
1114 		walker->count++;
1115 	}
1116 }
1117 
1118 static const struct Qdisc_class_ops netem_class_ops = {
1119 	.graft		=	netem_graft,
1120 	.leaf		=	netem_leaf,
1121 	.find		=	netem_find,
1122 	.walk		=	netem_walk,
1123 	.dump		=	netem_dump_class,
1124 };
1125 
1126 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1127 	.id		=	"netem",
1128 	.cl_ops		=	&netem_class_ops,
1129 	.priv_size	=	sizeof(struct netem_sched_data),
1130 	.enqueue	=	netem_enqueue,
1131 	.dequeue	=	netem_dequeue,
1132 	.peek		=	qdisc_peek_dequeued,
1133 	.init		=	netem_init,
1134 	.reset		=	netem_reset,
1135 	.destroy	=	netem_destroy,
1136 	.change		=	netem_change,
1137 	.dump		=	netem_dump,
1138 	.owner		=	THIS_MODULE,
1139 };
1140 
1141 
1142 static int __init netem_module_init(void)
1143 {
1144 	pr_info("netem: version " VERSION "\n");
1145 	return register_qdisc(&netem_qdisc_ops);
1146 }
1147 static void __exit netem_module_exit(void)
1148 {
1149 	unregister_qdisc(&netem_qdisc_ops);
1150 }
1151 module_init(netem_module_init)
1152 module_exit(netem_module_exit)
1153 MODULE_LICENSE("GPL");
1154