xref: /openbmc/linux/net/sched/sch_netem.c (revision 9fb29c73)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification that can be handled in
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to the
59 	"Gilbert-Elliot" model, a 4-state markov model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
70 
71 struct disttable {
72 	u32  size;
73 	s16 table[0];
74 };
75 
76 struct netem_sched_data {
77 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
78 	struct rb_root t_root;
79 
80 	/* a linear queue; reduces rbtree rebalancing when jitter is low */
81 	struct sk_buff	*t_head;
82 	struct sk_buff	*t_tail;
83 
84 	/* optional qdisc for classful handling (NULL at netem init) */
85 	struct Qdisc	*qdisc;
86 
87 	struct qdisc_watchdog watchdog;
88 
89 	s64 latency;
90 	s64 jitter;
91 
92 	u32 loss;
93 	u32 ecn;
94 	u32 limit;
95 	u32 counter;
96 	u32 gap;
97 	u32 duplicate;
98 	u32 reorder;
99 	u32 corrupt;
100 	u64 rate;
101 	s32 packet_overhead;
102 	u32 cell_size;
103 	struct reciprocal_value cell_size_reciprocal;
104 	s32 cell_overhead;
105 
106 	struct crndstate {
107 		u32 last;
108 		u32 rho;
109 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
110 
111 	struct disttable *delay_dist;
112 
113 	enum  {
114 		CLG_RANDOM,
115 		CLG_4_STATES,
116 		CLG_GILB_ELL,
117 	} loss_model;
118 
119 	enum {
120 		TX_IN_GAP_PERIOD = 1,
121 		TX_IN_BURST_PERIOD,
122 		LOST_IN_GAP_PERIOD,
123 		LOST_IN_BURST_PERIOD,
124 	} _4_state_model;
125 
126 	enum {
127 		GOOD_STATE = 1,
128 		BAD_STATE,
129 	} GE_state_model;
130 
131 	/* Correlated Loss Generation models */
132 	struct clgstate {
133 		/* state of the Markov chain */
134 		u8 state;
135 
136 		/* 4-states and Gilbert-Elliot models */
137 		u32 a1;	/* p13 for 4-states or p for GE */
138 		u32 a2;	/* p31 for 4-states or r for GE */
139 		u32 a3;	/* p32 for 4-states or h for GE */
140 		u32 a4;	/* p14 for 4-states or 1-k for GE */
141 		u32 a5; /* p23 used only in 4-states */
142 	} clg;
143 
144 	struct tc_netem_slot slot_config;
145 	struct slotstate {
146 		u64 slot_next;
147 		s32 packets_left;
148 		s32 bytes_left;
149 	} slot;
150 
151 	struct disttable *slot_dist;
152 };
153 
154 /* Time stamp put into socket buffer control block
155  * Only valid when skbs are in our internal t(ime)fifo queue.
156  *
157  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
158  * and skb->next & skb->prev are scratch space for a qdisc,
159  * we save skb->tstamp value in skb->cb[] before destroying it.
160  */
161 struct netem_skb_cb {
162 	u64	        time_to_send;
163 };
164 
165 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
166 {
167 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
168 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
169 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
170 }
171 
172 /* init_crandom - initialize correlated random number generator
173  * Use entropy source for initial seed.
174  */
175 static void init_crandom(struct crndstate *state, unsigned long rho)
176 {
177 	state->rho = rho;
178 	state->last = prandom_u32();
179 }
180 
181 /* get_crandom - correlated random number generator
182  * Next number depends on last value.
183  * rho is scaled to avoid floating point.
184  */
185 static u32 get_crandom(struct crndstate *state)
186 {
187 	u64 value, rho;
188 	unsigned long answer;
189 
190 	if (!state || state->rho == 0)	/* no correlation */
191 		return prandom_u32();
192 
193 	value = prandom_u32();
194 	rho = (u64)state->rho + 1;
195 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
196 	state->last = answer;
197 	return answer;
198 }
199 
200 /* loss_4state - 4-state model loss generator
201  * Generates losses according to the 4-state Markov chain adopted in
202  * the GI (General and Intuitive) loss model.
203  */
204 static bool loss_4state(struct netem_sched_data *q)
205 {
206 	struct clgstate *clg = &q->clg;
207 	u32 rnd = prandom_u32();
208 
209 	/*
210 	 * Makes a comparison between rnd and the transition
211 	 * probabilities outgoing from the current state, then decides the
212 	 * next state and if the next packet has to be transmitted or lost.
213 	 * The four states correspond to:
214 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
215 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
216 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
217 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
218 	 */
219 	switch (clg->state) {
220 	case TX_IN_GAP_PERIOD:
221 		if (rnd < clg->a4) {
222 			clg->state = LOST_IN_BURST_PERIOD;
223 			return true;
224 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
225 			clg->state = LOST_IN_GAP_PERIOD;
226 			return true;
227 		} else if (clg->a1 + clg->a4 < rnd) {
228 			clg->state = TX_IN_GAP_PERIOD;
229 		}
230 
231 		break;
232 	case TX_IN_BURST_PERIOD:
233 		if (rnd < clg->a5) {
234 			clg->state = LOST_IN_GAP_PERIOD;
235 			return true;
236 		} else {
237 			clg->state = TX_IN_BURST_PERIOD;
238 		}
239 
240 		break;
241 	case LOST_IN_GAP_PERIOD:
242 		if (rnd < clg->a3)
243 			clg->state = TX_IN_BURST_PERIOD;
244 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
245 			clg->state = TX_IN_GAP_PERIOD;
246 		} else if (clg->a2 + clg->a3 < rnd) {
247 			clg->state = LOST_IN_GAP_PERIOD;
248 			return true;
249 		}
250 		break;
251 	case LOST_IN_BURST_PERIOD:
252 		clg->state = TX_IN_GAP_PERIOD;
253 		break;
254 	}
255 
256 	return false;
257 }
258 
259 /* loss_gilb_ell - Gilbert-Elliot model loss generator
260  * Generates losses according to the Gilbert-Elliot loss model or
261  * its special cases  (Gilbert or Simple Gilbert)
262  *
263  * Makes a comparison between random number and the transition
264  * probabilities outgoing from the current state, then decides the
265  * next state. A second random number is extracted and the comparison
266  * with the loss probability of the current state decides if the next
267  * packet will be transmitted or lost.
268  */
269 static bool loss_gilb_ell(struct netem_sched_data *q)
270 {
271 	struct clgstate *clg = &q->clg;
272 
273 	switch (clg->state) {
274 	case GOOD_STATE:
275 		if (prandom_u32() < clg->a1)
276 			clg->state = BAD_STATE;
277 		if (prandom_u32() < clg->a4)
278 			return true;
279 		break;
280 	case BAD_STATE:
281 		if (prandom_u32() < clg->a2)
282 			clg->state = GOOD_STATE;
283 		if (prandom_u32() > clg->a3)
284 			return true;
285 	}
286 
287 	return false;
288 }
289 
290 static bool loss_event(struct netem_sched_data *q)
291 {
292 	switch (q->loss_model) {
293 	case CLG_RANDOM:
294 		/* Random packet drop 0 => none, ~0 => all */
295 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
296 
297 	case CLG_4_STATES:
298 		/* 4state loss model algorithm (used also for GI model)
299 		* Extracts a value from the markov 4 state loss generator,
300 		* if it is 1 drops a packet and if needed writes the event in
301 		* the kernel logs
302 		*/
303 		return loss_4state(q);
304 
305 	case CLG_GILB_ELL:
306 		/* Gilbert-Elliot loss model algorithm
307 		* Extracts a value from the Gilbert-Elliot loss generator,
308 		* if it is 1 drops a packet and if needed writes the event in
309 		* the kernel logs
310 		*/
311 		return loss_gilb_ell(q);
312 	}
313 
314 	return false;	/* not reached */
315 }
316 
317 
318 /* tabledist - return a pseudo-randomly distributed value with mean mu and
319  * std deviation sigma.  Uses table lookup to approximate the desired
320  * distribution, and a uniformly-distributed pseudo-random source.
321  */
322 static s64 tabledist(s64 mu, s32 sigma,
323 		     struct crndstate *state,
324 		     const struct disttable *dist)
325 {
326 	s64 x;
327 	long t;
328 	u32 rnd;
329 
330 	if (sigma == 0)
331 		return mu;
332 
333 	rnd = get_crandom(state);
334 
335 	/* default uniform distribution */
336 	if (dist == NULL)
337 		return ((rnd % (2 * sigma)) + mu) - sigma;
338 
339 	t = dist->table[rnd % dist->size];
340 	x = (sigma % NETEM_DIST_SCALE) * t;
341 	if (x >= 0)
342 		x += NETEM_DIST_SCALE/2;
343 	else
344 		x -= NETEM_DIST_SCALE/2;
345 
346 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
347 }
348 
349 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
350 {
351 	len += q->packet_overhead;
352 
353 	if (q->cell_size) {
354 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
355 
356 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
357 			cells++;
358 		len = cells * (q->cell_size + q->cell_overhead);
359 	}
360 
361 	return div64_u64(len * NSEC_PER_SEC, q->rate);
362 }
363 
364 static void tfifo_reset(struct Qdisc *sch)
365 {
366 	struct netem_sched_data *q = qdisc_priv(sch);
367 	struct rb_node *p = rb_first(&q->t_root);
368 
369 	while (p) {
370 		struct sk_buff *skb = rb_to_skb(p);
371 
372 		p = rb_next(p);
373 		rb_erase(&skb->rbnode, &q->t_root);
374 		rtnl_kfree_skbs(skb, skb);
375 	}
376 
377 	rtnl_kfree_skbs(q->t_head, q->t_tail);
378 	q->t_head = NULL;
379 	q->t_tail = NULL;
380 }
381 
382 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
383 {
384 	struct netem_sched_data *q = qdisc_priv(sch);
385 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
386 
387 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
388 		if (q->t_tail)
389 			q->t_tail->next = nskb;
390 		else
391 			q->t_head = nskb;
392 		q->t_tail = nskb;
393 	} else {
394 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
395 
396 		while (*p) {
397 			struct sk_buff *skb;
398 
399 			parent = *p;
400 			skb = rb_to_skb(parent);
401 			if (tnext >= netem_skb_cb(skb)->time_to_send)
402 				p = &parent->rb_right;
403 			else
404 				p = &parent->rb_left;
405 		}
406 		rb_link_node(&nskb->rbnode, parent, p);
407 		rb_insert_color(&nskb->rbnode, &q->t_root);
408 	}
409 	sch->q.qlen++;
410 }
411 
412 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
413  * when we statistically choose to corrupt one, we instead segment it, returning
414  * the first packet to be corrupted, and re-enqueue the remaining frames
415  */
416 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
417 				     struct sk_buff **to_free)
418 {
419 	struct sk_buff *segs;
420 	netdev_features_t features = netif_skb_features(skb);
421 
422 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
423 
424 	if (IS_ERR_OR_NULL(segs)) {
425 		qdisc_drop(skb, sch, to_free);
426 		return NULL;
427 	}
428 	consume_skb(skb);
429 	return segs;
430 }
431 
432 /*
433  * Insert one skb into qdisc.
434  * Note: parent depends on return value to account for queue length.
435  * 	NET_XMIT_DROP: queue length didn't change.
436  *      NET_XMIT_SUCCESS: one skb was queued.
437  */
438 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
439 			 struct sk_buff **to_free)
440 {
441 	struct netem_sched_data *q = qdisc_priv(sch);
442 	/* We don't fill cb now as skb_unshare() may invalidate it */
443 	struct netem_skb_cb *cb;
444 	struct sk_buff *skb2;
445 	struct sk_buff *segs = NULL;
446 	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
447 	int nb = 0;
448 	int count = 1;
449 	int rc = NET_XMIT_SUCCESS;
450 
451 	/* Do not fool qdisc_drop_all() */
452 	skb->prev = NULL;
453 
454 	/* Random duplication */
455 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
456 		++count;
457 
458 	/* Drop packet? */
459 	if (loss_event(q)) {
460 		if (q->ecn && INET_ECN_set_ce(skb))
461 			qdisc_qstats_drop(sch); /* mark packet */
462 		else
463 			--count;
464 	}
465 	if (count == 0) {
466 		qdisc_qstats_drop(sch);
467 		__qdisc_drop(skb, to_free);
468 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
469 	}
470 
471 	/* If a delay is expected, orphan the skb. (orphaning usually takes
472 	 * place at TX completion time, so _before_ the link transit delay)
473 	 */
474 	if (q->latency || q->jitter || q->rate)
475 		skb_orphan_partial(skb);
476 
477 	/*
478 	 * If we need to duplicate packet, then re-insert at top of the
479 	 * qdisc tree, since parent queuer expects that only one
480 	 * skb will be queued.
481 	 */
482 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
483 		struct Qdisc *rootq = qdisc_root(sch);
484 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
485 
486 		q->duplicate = 0;
487 		rootq->enqueue(skb2, rootq, to_free);
488 		q->duplicate = dupsave;
489 	}
490 
491 	/*
492 	 * Randomized packet corruption.
493 	 * Make copy if needed since we are modifying
494 	 * If packet is going to be hardware checksummed, then
495 	 * do it now in software before we mangle it.
496 	 */
497 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
498 		if (skb_is_gso(skb)) {
499 			segs = netem_segment(skb, sch, to_free);
500 			if (!segs)
501 				return NET_XMIT_DROP;
502 		} else {
503 			segs = skb;
504 		}
505 
506 		skb = segs;
507 		segs = segs->next;
508 
509 		skb = skb_unshare(skb, GFP_ATOMIC);
510 		if (unlikely(!skb)) {
511 			qdisc_qstats_drop(sch);
512 			goto finish_segs;
513 		}
514 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
515 		    skb_checksum_help(skb)) {
516 			qdisc_drop(skb, sch, to_free);
517 			goto finish_segs;
518 		}
519 
520 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
521 			1<<(prandom_u32() % 8);
522 	}
523 
524 	if (unlikely(sch->q.qlen >= sch->limit))
525 		return qdisc_drop_all(skb, sch, to_free);
526 
527 	qdisc_qstats_backlog_inc(sch, skb);
528 
529 	cb = netem_skb_cb(skb);
530 	if (q->gap == 0 ||		/* not doing reordering */
531 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
532 	    q->reorder < get_crandom(&q->reorder_cor)) {
533 		u64 now;
534 		s64 delay;
535 
536 		delay = tabledist(q->latency, q->jitter,
537 				  &q->delay_cor, q->delay_dist);
538 
539 		now = ktime_get_ns();
540 
541 		if (q->rate) {
542 			struct netem_skb_cb *last = NULL;
543 
544 			if (sch->q.tail)
545 				last = netem_skb_cb(sch->q.tail);
546 			if (q->t_root.rb_node) {
547 				struct sk_buff *t_skb;
548 				struct netem_skb_cb *t_last;
549 
550 				t_skb = skb_rb_last(&q->t_root);
551 				t_last = netem_skb_cb(t_skb);
552 				if (!last ||
553 				    t_last->time_to_send > last->time_to_send)
554 					last = t_last;
555 			}
556 			if (q->t_tail) {
557 				struct netem_skb_cb *t_last =
558 					netem_skb_cb(q->t_tail);
559 
560 				if (!last ||
561 				    t_last->time_to_send > last->time_to_send)
562 					last = t_last;
563 			}
564 
565 			if (last) {
566 				/*
567 				 * Last packet in queue is reference point (now),
568 				 * calculate this time bonus and subtract
569 				 * from delay.
570 				 */
571 				delay -= last->time_to_send - now;
572 				delay = max_t(s64, 0, delay);
573 				now = last->time_to_send;
574 			}
575 
576 			delay += packet_time_ns(qdisc_pkt_len(skb), q);
577 		}
578 
579 		cb->time_to_send = now + delay;
580 		++q->counter;
581 		tfifo_enqueue(skb, sch);
582 	} else {
583 		/*
584 		 * Do re-ordering by putting one out of N packets at the front
585 		 * of the queue.
586 		 */
587 		cb->time_to_send = ktime_get_ns();
588 		q->counter = 0;
589 
590 		__qdisc_enqueue_head(skb, &sch->q);
591 		sch->qstats.requeues++;
592 	}
593 
594 finish_segs:
595 	if (segs) {
596 		while (segs) {
597 			skb2 = segs->next;
598 			skb_mark_not_on_list(segs);
599 			qdisc_skb_cb(segs)->pkt_len = segs->len;
600 			last_len = segs->len;
601 			rc = qdisc_enqueue(segs, sch, to_free);
602 			if (rc != NET_XMIT_SUCCESS) {
603 				if (net_xmit_drop_count(rc))
604 					qdisc_qstats_drop(sch);
605 			} else {
606 				nb++;
607 				len += last_len;
608 			}
609 			segs = skb2;
610 		}
611 		sch->q.qlen += nb;
612 		if (nb > 1)
613 			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
614 	}
615 	return NET_XMIT_SUCCESS;
616 }
617 
618 /* Delay the next round with a new future slot with a
619  * correct number of bytes and packets.
620  */
621 
622 static void get_slot_next(struct netem_sched_data *q, u64 now)
623 {
624 	s64 next_delay;
625 
626 	if (!q->slot_dist)
627 		next_delay = q->slot_config.min_delay +
628 				(prandom_u32() *
629 				 (q->slot_config.max_delay -
630 				  q->slot_config.min_delay) >> 32);
631 	else
632 		next_delay = tabledist(q->slot_config.dist_delay,
633 				       (s32)(q->slot_config.dist_jitter),
634 				       NULL, q->slot_dist);
635 
636 	q->slot.slot_next = now + next_delay;
637 	q->slot.packets_left = q->slot_config.max_packets;
638 	q->slot.bytes_left = q->slot_config.max_bytes;
639 }
640 
641 static struct sk_buff *netem_peek(struct netem_sched_data *q)
642 {
643 	struct sk_buff *skb = skb_rb_first(&q->t_root);
644 	u64 t1, t2;
645 
646 	if (!skb)
647 		return q->t_head;
648 	if (!q->t_head)
649 		return skb;
650 
651 	t1 = netem_skb_cb(skb)->time_to_send;
652 	t2 = netem_skb_cb(q->t_head)->time_to_send;
653 	if (t1 < t2)
654 		return skb;
655 	return q->t_head;
656 }
657 
658 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
659 {
660 	if (skb == q->t_head) {
661 		q->t_head = skb->next;
662 		if (!q->t_head)
663 			q->t_tail = NULL;
664 	} else {
665 		rb_erase(&skb->rbnode, &q->t_root);
666 	}
667 }
668 
669 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
670 {
671 	struct netem_sched_data *q = qdisc_priv(sch);
672 	struct sk_buff *skb;
673 
674 tfifo_dequeue:
675 	skb = __qdisc_dequeue_head(&sch->q);
676 	if (skb) {
677 		qdisc_qstats_backlog_dec(sch, skb);
678 deliver:
679 		qdisc_bstats_update(sch, skb);
680 		return skb;
681 	}
682 	skb = netem_peek(q);
683 	if (skb) {
684 		u64 time_to_send;
685 		u64 now = ktime_get_ns();
686 
687 		/* if more time remaining? */
688 		time_to_send = netem_skb_cb(skb)->time_to_send;
689 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
690 			get_slot_next(q, now);
691 
692 		if (time_to_send <= now && q->slot.slot_next <= now) {
693 			netem_erase_head(q, skb);
694 			sch->q.qlen--;
695 			qdisc_qstats_backlog_dec(sch, skb);
696 			skb->next = NULL;
697 			skb->prev = NULL;
698 			/* skb->dev shares skb->rbnode area,
699 			 * we need to restore its value.
700 			 */
701 			skb->dev = qdisc_dev(sch);
702 
703 			if (q->slot.slot_next) {
704 				q->slot.packets_left--;
705 				q->slot.bytes_left -= qdisc_pkt_len(skb);
706 				if (q->slot.packets_left <= 0 ||
707 				    q->slot.bytes_left <= 0)
708 					get_slot_next(q, now);
709 			}
710 
711 			if (q->qdisc) {
712 				unsigned int pkt_len = qdisc_pkt_len(skb);
713 				struct sk_buff *to_free = NULL;
714 				int err;
715 
716 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
717 				kfree_skb_list(to_free);
718 				if (err != NET_XMIT_SUCCESS &&
719 				    net_xmit_drop_count(err)) {
720 					qdisc_qstats_drop(sch);
721 					qdisc_tree_reduce_backlog(sch, 1,
722 								  pkt_len);
723 				}
724 				goto tfifo_dequeue;
725 			}
726 			goto deliver;
727 		}
728 
729 		if (q->qdisc) {
730 			skb = q->qdisc->ops->dequeue(q->qdisc);
731 			if (skb)
732 				goto deliver;
733 		}
734 
735 		qdisc_watchdog_schedule_ns(&q->watchdog,
736 					   max(time_to_send,
737 					       q->slot.slot_next));
738 	}
739 
740 	if (q->qdisc) {
741 		skb = q->qdisc->ops->dequeue(q->qdisc);
742 		if (skb)
743 			goto deliver;
744 	}
745 	return NULL;
746 }
747 
748 static void netem_reset(struct Qdisc *sch)
749 {
750 	struct netem_sched_data *q = qdisc_priv(sch);
751 
752 	qdisc_reset_queue(sch);
753 	tfifo_reset(sch);
754 	if (q->qdisc)
755 		qdisc_reset(q->qdisc);
756 	qdisc_watchdog_cancel(&q->watchdog);
757 }
758 
759 static void dist_free(struct disttable *d)
760 {
761 	kvfree(d);
762 }
763 
764 /*
765  * Distribution data is a variable size payload containing
766  * signed 16 bit values.
767  */
768 
769 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
770 			  const struct nlattr *attr)
771 {
772 	size_t n = nla_len(attr)/sizeof(__s16);
773 	const __s16 *data = nla_data(attr);
774 	spinlock_t *root_lock;
775 	struct disttable *d;
776 	int i;
777 
778 	if (n > NETEM_DIST_MAX)
779 		return -EINVAL;
780 
781 	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
782 	if (!d)
783 		return -ENOMEM;
784 
785 	d->size = n;
786 	for (i = 0; i < n; i++)
787 		d->table[i] = data[i];
788 
789 	root_lock = qdisc_root_sleeping_lock(sch);
790 
791 	spin_lock_bh(root_lock);
792 	swap(*tbl, d);
793 	spin_unlock_bh(root_lock);
794 
795 	dist_free(d);
796 	return 0;
797 }
798 
799 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
800 {
801 	const struct tc_netem_slot *c = nla_data(attr);
802 
803 	q->slot_config = *c;
804 	if (q->slot_config.max_packets == 0)
805 		q->slot_config.max_packets = INT_MAX;
806 	if (q->slot_config.max_bytes == 0)
807 		q->slot_config.max_bytes = INT_MAX;
808 	q->slot.packets_left = q->slot_config.max_packets;
809 	q->slot.bytes_left = q->slot_config.max_bytes;
810 	if (q->slot_config.min_delay | q->slot_config.max_delay |
811 	    q->slot_config.dist_jitter)
812 		q->slot.slot_next = ktime_get_ns();
813 	else
814 		q->slot.slot_next = 0;
815 }
816 
817 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
818 {
819 	const struct tc_netem_corr *c = nla_data(attr);
820 
821 	init_crandom(&q->delay_cor, c->delay_corr);
822 	init_crandom(&q->loss_cor, c->loss_corr);
823 	init_crandom(&q->dup_cor, c->dup_corr);
824 }
825 
826 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
827 {
828 	const struct tc_netem_reorder *r = nla_data(attr);
829 
830 	q->reorder = r->probability;
831 	init_crandom(&q->reorder_cor, r->correlation);
832 }
833 
834 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
835 {
836 	const struct tc_netem_corrupt *r = nla_data(attr);
837 
838 	q->corrupt = r->probability;
839 	init_crandom(&q->corrupt_cor, r->correlation);
840 }
841 
842 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
843 {
844 	const struct tc_netem_rate *r = nla_data(attr);
845 
846 	q->rate = r->rate;
847 	q->packet_overhead = r->packet_overhead;
848 	q->cell_size = r->cell_size;
849 	q->cell_overhead = r->cell_overhead;
850 	if (q->cell_size)
851 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
852 	else
853 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
854 }
855 
856 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
857 {
858 	const struct nlattr *la;
859 	int rem;
860 
861 	nla_for_each_nested(la, attr, rem) {
862 		u16 type = nla_type(la);
863 
864 		switch (type) {
865 		case NETEM_LOSS_GI: {
866 			const struct tc_netem_gimodel *gi = nla_data(la);
867 
868 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
869 				pr_info("netem: incorrect gi model size\n");
870 				return -EINVAL;
871 			}
872 
873 			q->loss_model = CLG_4_STATES;
874 
875 			q->clg.state = TX_IN_GAP_PERIOD;
876 			q->clg.a1 = gi->p13;
877 			q->clg.a2 = gi->p31;
878 			q->clg.a3 = gi->p32;
879 			q->clg.a4 = gi->p14;
880 			q->clg.a5 = gi->p23;
881 			break;
882 		}
883 
884 		case NETEM_LOSS_GE: {
885 			const struct tc_netem_gemodel *ge = nla_data(la);
886 
887 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
888 				pr_info("netem: incorrect ge model size\n");
889 				return -EINVAL;
890 			}
891 
892 			q->loss_model = CLG_GILB_ELL;
893 			q->clg.state = GOOD_STATE;
894 			q->clg.a1 = ge->p;
895 			q->clg.a2 = ge->r;
896 			q->clg.a3 = ge->h;
897 			q->clg.a4 = ge->k1;
898 			break;
899 		}
900 
901 		default:
902 			pr_info("netem: unknown loss type %u\n", type);
903 			return -EINVAL;
904 		}
905 	}
906 
907 	return 0;
908 }
909 
910 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
911 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
912 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
913 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
914 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
915 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
916 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
917 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
918 	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
919 	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
920 	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
921 };
922 
923 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
924 		      const struct nla_policy *policy, int len)
925 {
926 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
927 
928 	if (nested_len < 0) {
929 		pr_info("netem: invalid attributes len %d\n", nested_len);
930 		return -EINVAL;
931 	}
932 
933 	if (nested_len >= nla_attr_size(0))
934 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
935 				 nested_len, policy, NULL);
936 
937 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
938 	return 0;
939 }
940 
941 /* Parse netlink message to set options */
942 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
943 			struct netlink_ext_ack *extack)
944 {
945 	struct netem_sched_data *q = qdisc_priv(sch);
946 	struct nlattr *tb[TCA_NETEM_MAX + 1];
947 	struct tc_netem_qopt *qopt;
948 	struct clgstate old_clg;
949 	int old_loss_model = CLG_RANDOM;
950 	int ret;
951 
952 	if (opt == NULL)
953 		return -EINVAL;
954 
955 	qopt = nla_data(opt);
956 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
957 	if (ret < 0)
958 		return ret;
959 
960 	/* backup q->clg and q->loss_model */
961 	old_clg = q->clg;
962 	old_loss_model = q->loss_model;
963 
964 	if (tb[TCA_NETEM_LOSS]) {
965 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
966 		if (ret) {
967 			q->loss_model = old_loss_model;
968 			return ret;
969 		}
970 	} else {
971 		q->loss_model = CLG_RANDOM;
972 	}
973 
974 	if (tb[TCA_NETEM_DELAY_DIST]) {
975 		ret = get_dist_table(sch, &q->delay_dist,
976 				     tb[TCA_NETEM_DELAY_DIST]);
977 		if (ret)
978 			goto get_table_failure;
979 	}
980 
981 	if (tb[TCA_NETEM_SLOT_DIST]) {
982 		ret = get_dist_table(sch, &q->slot_dist,
983 				     tb[TCA_NETEM_SLOT_DIST]);
984 		if (ret)
985 			goto get_table_failure;
986 	}
987 
988 	sch->limit = qopt->limit;
989 
990 	q->latency = PSCHED_TICKS2NS(qopt->latency);
991 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
992 	q->limit = qopt->limit;
993 	q->gap = qopt->gap;
994 	q->counter = 0;
995 	q->loss = qopt->loss;
996 	q->duplicate = qopt->duplicate;
997 
998 	/* for compatibility with earlier versions.
999 	 * if gap is set, need to assume 100% probability
1000 	 */
1001 	if (q->gap)
1002 		q->reorder = ~0;
1003 
1004 	if (tb[TCA_NETEM_CORR])
1005 		get_correlation(q, tb[TCA_NETEM_CORR]);
1006 
1007 	if (tb[TCA_NETEM_REORDER])
1008 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1009 
1010 	if (tb[TCA_NETEM_CORRUPT])
1011 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1012 
1013 	if (tb[TCA_NETEM_RATE])
1014 		get_rate(q, tb[TCA_NETEM_RATE]);
1015 
1016 	if (tb[TCA_NETEM_RATE64])
1017 		q->rate = max_t(u64, q->rate,
1018 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1019 
1020 	if (tb[TCA_NETEM_LATENCY64])
1021 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1022 
1023 	if (tb[TCA_NETEM_JITTER64])
1024 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1025 
1026 	if (tb[TCA_NETEM_ECN])
1027 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1028 
1029 	if (tb[TCA_NETEM_SLOT])
1030 		get_slot(q, tb[TCA_NETEM_SLOT]);
1031 
1032 	return ret;
1033 
1034 get_table_failure:
1035 	/* recover clg and loss_model, in case of
1036 	 * q->clg and q->loss_model were modified
1037 	 * in get_loss_clg()
1038 	 */
1039 	q->clg = old_clg;
1040 	q->loss_model = old_loss_model;
1041 	return ret;
1042 }
1043 
1044 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1045 		      struct netlink_ext_ack *extack)
1046 {
1047 	struct netem_sched_data *q = qdisc_priv(sch);
1048 	int ret;
1049 
1050 	qdisc_watchdog_init(&q->watchdog, sch);
1051 
1052 	if (!opt)
1053 		return -EINVAL;
1054 
1055 	q->loss_model = CLG_RANDOM;
1056 	ret = netem_change(sch, opt, extack);
1057 	if (ret)
1058 		pr_info("netem: change failed\n");
1059 	return ret;
1060 }
1061 
1062 static void netem_destroy(struct Qdisc *sch)
1063 {
1064 	struct netem_sched_data *q = qdisc_priv(sch);
1065 
1066 	qdisc_watchdog_cancel(&q->watchdog);
1067 	if (q->qdisc)
1068 		qdisc_put(q->qdisc);
1069 	dist_free(q->delay_dist);
1070 	dist_free(q->slot_dist);
1071 }
1072 
1073 static int dump_loss_model(const struct netem_sched_data *q,
1074 			   struct sk_buff *skb)
1075 {
1076 	struct nlattr *nest;
1077 
1078 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
1079 	if (nest == NULL)
1080 		goto nla_put_failure;
1081 
1082 	switch (q->loss_model) {
1083 	case CLG_RANDOM:
1084 		/* legacy loss model */
1085 		nla_nest_cancel(skb, nest);
1086 		return 0;	/* no data */
1087 
1088 	case CLG_4_STATES: {
1089 		struct tc_netem_gimodel gi = {
1090 			.p13 = q->clg.a1,
1091 			.p31 = q->clg.a2,
1092 			.p32 = q->clg.a3,
1093 			.p14 = q->clg.a4,
1094 			.p23 = q->clg.a5,
1095 		};
1096 
1097 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1098 			goto nla_put_failure;
1099 		break;
1100 	}
1101 	case CLG_GILB_ELL: {
1102 		struct tc_netem_gemodel ge = {
1103 			.p = q->clg.a1,
1104 			.r = q->clg.a2,
1105 			.h = q->clg.a3,
1106 			.k1 = q->clg.a4,
1107 		};
1108 
1109 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1110 			goto nla_put_failure;
1111 		break;
1112 	}
1113 	}
1114 
1115 	nla_nest_end(skb, nest);
1116 	return 0;
1117 
1118 nla_put_failure:
1119 	nla_nest_cancel(skb, nest);
1120 	return -1;
1121 }
1122 
1123 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1124 {
1125 	const struct netem_sched_data *q = qdisc_priv(sch);
1126 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1127 	struct tc_netem_qopt qopt;
1128 	struct tc_netem_corr cor;
1129 	struct tc_netem_reorder reorder;
1130 	struct tc_netem_corrupt corrupt;
1131 	struct tc_netem_rate rate;
1132 	struct tc_netem_slot slot;
1133 
1134 	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1135 			     UINT_MAX);
1136 	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1137 			    UINT_MAX);
1138 	qopt.limit = q->limit;
1139 	qopt.loss = q->loss;
1140 	qopt.gap = q->gap;
1141 	qopt.duplicate = q->duplicate;
1142 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1143 		goto nla_put_failure;
1144 
1145 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1146 		goto nla_put_failure;
1147 
1148 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1149 		goto nla_put_failure;
1150 
1151 	cor.delay_corr = q->delay_cor.rho;
1152 	cor.loss_corr = q->loss_cor.rho;
1153 	cor.dup_corr = q->dup_cor.rho;
1154 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1155 		goto nla_put_failure;
1156 
1157 	reorder.probability = q->reorder;
1158 	reorder.correlation = q->reorder_cor.rho;
1159 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1160 		goto nla_put_failure;
1161 
1162 	corrupt.probability = q->corrupt;
1163 	corrupt.correlation = q->corrupt_cor.rho;
1164 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1165 		goto nla_put_failure;
1166 
1167 	if (q->rate >= (1ULL << 32)) {
1168 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1169 				      TCA_NETEM_PAD))
1170 			goto nla_put_failure;
1171 		rate.rate = ~0U;
1172 	} else {
1173 		rate.rate = q->rate;
1174 	}
1175 	rate.packet_overhead = q->packet_overhead;
1176 	rate.cell_size = q->cell_size;
1177 	rate.cell_overhead = q->cell_overhead;
1178 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1179 		goto nla_put_failure;
1180 
1181 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1182 		goto nla_put_failure;
1183 
1184 	if (dump_loss_model(q, skb) != 0)
1185 		goto nla_put_failure;
1186 
1187 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1188 	    q->slot_config.dist_jitter) {
1189 		slot = q->slot_config;
1190 		if (slot.max_packets == INT_MAX)
1191 			slot.max_packets = 0;
1192 		if (slot.max_bytes == INT_MAX)
1193 			slot.max_bytes = 0;
1194 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1195 			goto nla_put_failure;
1196 	}
1197 
1198 	return nla_nest_end(skb, nla);
1199 
1200 nla_put_failure:
1201 	nlmsg_trim(skb, nla);
1202 	return -1;
1203 }
1204 
1205 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1206 			  struct sk_buff *skb, struct tcmsg *tcm)
1207 {
1208 	struct netem_sched_data *q = qdisc_priv(sch);
1209 
1210 	if (cl != 1 || !q->qdisc) 	/* only one class */
1211 		return -ENOENT;
1212 
1213 	tcm->tcm_handle |= TC_H_MIN(1);
1214 	tcm->tcm_info = q->qdisc->handle;
1215 
1216 	return 0;
1217 }
1218 
1219 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1220 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1221 {
1222 	struct netem_sched_data *q = qdisc_priv(sch);
1223 
1224 	*old = qdisc_replace(sch, new, &q->qdisc);
1225 	return 0;
1226 }
1227 
1228 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1229 {
1230 	struct netem_sched_data *q = qdisc_priv(sch);
1231 	return q->qdisc;
1232 }
1233 
1234 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1235 {
1236 	return 1;
1237 }
1238 
1239 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1240 {
1241 	if (!walker->stop) {
1242 		if (walker->count >= walker->skip)
1243 			if (walker->fn(sch, 1, walker) < 0) {
1244 				walker->stop = 1;
1245 				return;
1246 			}
1247 		walker->count++;
1248 	}
1249 }
1250 
1251 static const struct Qdisc_class_ops netem_class_ops = {
1252 	.graft		=	netem_graft,
1253 	.leaf		=	netem_leaf,
1254 	.find		=	netem_find,
1255 	.walk		=	netem_walk,
1256 	.dump		=	netem_dump_class,
1257 };
1258 
1259 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1260 	.id		=	"netem",
1261 	.cl_ops		=	&netem_class_ops,
1262 	.priv_size	=	sizeof(struct netem_sched_data),
1263 	.enqueue	=	netem_enqueue,
1264 	.dequeue	=	netem_dequeue,
1265 	.peek		=	qdisc_peek_dequeued,
1266 	.init		=	netem_init,
1267 	.reset		=	netem_reset,
1268 	.destroy	=	netem_destroy,
1269 	.change		=	netem_change,
1270 	.dump		=	netem_dump,
1271 	.owner		=	THIS_MODULE,
1272 };
1273 
1274 
1275 static int __init netem_module_init(void)
1276 {
1277 	pr_info("netem: version " VERSION "\n");
1278 	return register_qdisc(&netem_qdisc_ops);
1279 }
1280 static void __exit netem_module_exit(void)
1281 {
1282 	unregister_qdisc(&netem_qdisc_ops);
1283 }
1284 module_init(netem_module_init)
1285 module_exit(netem_module_exit)
1286 MODULE_LICENSE("GPL");
1287