xref: /openbmc/linux/net/sched/sch_netem.c (revision f519f0be)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification that can be handled in
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to the
59 	"Gilbert-Elliot" model, a 4-state markov model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
70 
71 struct disttable {
72 	u32  size;
73 	s16 table[0];
74 };
75 
76 struct netem_sched_data {
77 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
78 	struct rb_root t_root;
79 
80 	/* a linear queue; reduces rbtree rebalancing when jitter is low */
81 	struct sk_buff	*t_head;
82 	struct sk_buff	*t_tail;
83 
84 	/* optional qdisc for classful handling (NULL at netem init) */
85 	struct Qdisc	*qdisc;
86 
87 	struct qdisc_watchdog watchdog;
88 
89 	s64 latency;
90 	s64 jitter;
91 
92 	u32 loss;
93 	u32 ecn;
94 	u32 limit;
95 	u32 counter;
96 	u32 gap;
97 	u32 duplicate;
98 	u32 reorder;
99 	u32 corrupt;
100 	u64 rate;
101 	s32 packet_overhead;
102 	u32 cell_size;
103 	struct reciprocal_value cell_size_reciprocal;
104 	s32 cell_overhead;
105 
106 	struct crndstate {
107 		u32 last;
108 		u32 rho;
109 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
110 
111 	struct disttable *delay_dist;
112 
113 	enum  {
114 		CLG_RANDOM,
115 		CLG_4_STATES,
116 		CLG_GILB_ELL,
117 	} loss_model;
118 
119 	enum {
120 		TX_IN_GAP_PERIOD = 1,
121 		TX_IN_BURST_PERIOD,
122 		LOST_IN_GAP_PERIOD,
123 		LOST_IN_BURST_PERIOD,
124 	} _4_state_model;
125 
126 	enum {
127 		GOOD_STATE = 1,
128 		BAD_STATE,
129 	} GE_state_model;
130 
131 	/* Correlated Loss Generation models */
132 	struct clgstate {
133 		/* state of the Markov chain */
134 		u8 state;
135 
136 		/* 4-states and Gilbert-Elliot models */
137 		u32 a1;	/* p13 for 4-states or p for GE */
138 		u32 a2;	/* p31 for 4-states or r for GE */
139 		u32 a3;	/* p32 for 4-states or h for GE */
140 		u32 a4;	/* p14 for 4-states or 1-k for GE */
141 		u32 a5; /* p23 used only in 4-states */
142 	} clg;
143 
144 	struct tc_netem_slot slot_config;
145 	struct slotstate {
146 		u64 slot_next;
147 		s32 packets_left;
148 		s32 bytes_left;
149 	} slot;
150 
151 	struct disttable *slot_dist;
152 };
153 
154 /* Time stamp put into socket buffer control block
155  * Only valid when skbs are in our internal t(ime)fifo queue.
156  *
157  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
158  * and skb->next & skb->prev are scratch space for a qdisc,
159  * we save skb->tstamp value in skb->cb[] before destroying it.
160  */
161 struct netem_skb_cb {
162 	u64	        time_to_send;
163 };
164 
165 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
166 {
167 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
168 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
169 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
170 }
171 
172 /* init_crandom - initialize correlated random number generator
173  * Use entropy source for initial seed.
174  */
175 static void init_crandom(struct crndstate *state, unsigned long rho)
176 {
177 	state->rho = rho;
178 	state->last = prandom_u32();
179 }
180 
181 /* get_crandom - correlated random number generator
182  * Next number depends on last value.
183  * rho is scaled to avoid floating point.
184  */
185 static u32 get_crandom(struct crndstate *state)
186 {
187 	u64 value, rho;
188 	unsigned long answer;
189 
190 	if (!state || state->rho == 0)	/* no correlation */
191 		return prandom_u32();
192 
193 	value = prandom_u32();
194 	rho = (u64)state->rho + 1;
195 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
196 	state->last = answer;
197 	return answer;
198 }
199 
200 /* loss_4state - 4-state model loss generator
201  * Generates losses according to the 4-state Markov chain adopted in
202  * the GI (General and Intuitive) loss model.
203  */
204 static bool loss_4state(struct netem_sched_data *q)
205 {
206 	struct clgstate *clg = &q->clg;
207 	u32 rnd = prandom_u32();
208 
209 	/*
210 	 * Makes a comparison between rnd and the transition
211 	 * probabilities outgoing from the current state, then decides the
212 	 * next state and if the next packet has to be transmitted or lost.
213 	 * The four states correspond to:
214 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
215 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
216 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
217 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
218 	 */
219 	switch (clg->state) {
220 	case TX_IN_GAP_PERIOD:
221 		if (rnd < clg->a4) {
222 			clg->state = LOST_IN_BURST_PERIOD;
223 			return true;
224 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
225 			clg->state = LOST_IN_GAP_PERIOD;
226 			return true;
227 		} else if (clg->a1 + clg->a4 < rnd) {
228 			clg->state = TX_IN_GAP_PERIOD;
229 		}
230 
231 		break;
232 	case TX_IN_BURST_PERIOD:
233 		if (rnd < clg->a5) {
234 			clg->state = LOST_IN_GAP_PERIOD;
235 			return true;
236 		} else {
237 			clg->state = TX_IN_BURST_PERIOD;
238 		}
239 
240 		break;
241 	case LOST_IN_GAP_PERIOD:
242 		if (rnd < clg->a3)
243 			clg->state = TX_IN_BURST_PERIOD;
244 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
245 			clg->state = TX_IN_GAP_PERIOD;
246 		} else if (clg->a2 + clg->a3 < rnd) {
247 			clg->state = LOST_IN_GAP_PERIOD;
248 			return true;
249 		}
250 		break;
251 	case LOST_IN_BURST_PERIOD:
252 		clg->state = TX_IN_GAP_PERIOD;
253 		break;
254 	}
255 
256 	return false;
257 }
258 
259 /* loss_gilb_ell - Gilbert-Elliot model loss generator
260  * Generates losses according to the Gilbert-Elliot loss model or
261  * its special cases  (Gilbert or Simple Gilbert)
262  *
263  * Makes a comparison between random number and the transition
264  * probabilities outgoing from the current state, then decides the
265  * next state. A second random number is extracted and the comparison
266  * with the loss probability of the current state decides if the next
267  * packet will be transmitted or lost.
268  */
269 static bool loss_gilb_ell(struct netem_sched_data *q)
270 {
271 	struct clgstate *clg = &q->clg;
272 
273 	switch (clg->state) {
274 	case GOOD_STATE:
275 		if (prandom_u32() < clg->a1)
276 			clg->state = BAD_STATE;
277 		if (prandom_u32() < clg->a4)
278 			return true;
279 		break;
280 	case BAD_STATE:
281 		if (prandom_u32() < clg->a2)
282 			clg->state = GOOD_STATE;
283 		if (prandom_u32() > clg->a3)
284 			return true;
285 	}
286 
287 	return false;
288 }
289 
290 static bool loss_event(struct netem_sched_data *q)
291 {
292 	switch (q->loss_model) {
293 	case CLG_RANDOM:
294 		/* Random packet drop 0 => none, ~0 => all */
295 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
296 
297 	case CLG_4_STATES:
298 		/* 4state loss model algorithm (used also for GI model)
299 		* Extracts a value from the markov 4 state loss generator,
300 		* if it is 1 drops a packet and if needed writes the event in
301 		* the kernel logs
302 		*/
303 		return loss_4state(q);
304 
305 	case CLG_GILB_ELL:
306 		/* Gilbert-Elliot loss model algorithm
307 		* Extracts a value from the Gilbert-Elliot loss generator,
308 		* if it is 1 drops a packet and if needed writes the event in
309 		* the kernel logs
310 		*/
311 		return loss_gilb_ell(q);
312 	}
313 
314 	return false;	/* not reached */
315 }
316 
317 
318 /* tabledist - return a pseudo-randomly distributed value with mean mu and
319  * std deviation sigma.  Uses table lookup to approximate the desired
320  * distribution, and a uniformly-distributed pseudo-random source.
321  */
322 static s64 tabledist(s64 mu, s32 sigma,
323 		     struct crndstate *state,
324 		     const struct disttable *dist)
325 {
326 	s64 x;
327 	long t;
328 	u32 rnd;
329 
330 	if (sigma == 0)
331 		return mu;
332 
333 	rnd = get_crandom(state);
334 
335 	/* default uniform distribution */
336 	if (dist == NULL)
337 		return ((rnd % (2 * sigma)) + mu) - sigma;
338 
339 	t = dist->table[rnd % dist->size];
340 	x = (sigma % NETEM_DIST_SCALE) * t;
341 	if (x >= 0)
342 		x += NETEM_DIST_SCALE/2;
343 	else
344 		x -= NETEM_DIST_SCALE/2;
345 
346 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
347 }
348 
349 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
350 {
351 	len += q->packet_overhead;
352 
353 	if (q->cell_size) {
354 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
355 
356 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
357 			cells++;
358 		len = cells * (q->cell_size + q->cell_overhead);
359 	}
360 
361 	return div64_u64(len * NSEC_PER_SEC, q->rate);
362 }
363 
364 static void tfifo_reset(struct Qdisc *sch)
365 {
366 	struct netem_sched_data *q = qdisc_priv(sch);
367 	struct rb_node *p = rb_first(&q->t_root);
368 
369 	while (p) {
370 		struct sk_buff *skb = rb_to_skb(p);
371 
372 		p = rb_next(p);
373 		rb_erase(&skb->rbnode, &q->t_root);
374 		rtnl_kfree_skbs(skb, skb);
375 	}
376 
377 	rtnl_kfree_skbs(q->t_head, q->t_tail);
378 	q->t_head = NULL;
379 	q->t_tail = NULL;
380 }
381 
382 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
383 {
384 	struct netem_sched_data *q = qdisc_priv(sch);
385 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
386 
387 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
388 		if (q->t_tail)
389 			q->t_tail->next = nskb;
390 		else
391 			q->t_head = nskb;
392 		q->t_tail = nskb;
393 	} else {
394 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
395 
396 		while (*p) {
397 			struct sk_buff *skb;
398 
399 			parent = *p;
400 			skb = rb_to_skb(parent);
401 			if (tnext >= netem_skb_cb(skb)->time_to_send)
402 				p = &parent->rb_right;
403 			else
404 				p = &parent->rb_left;
405 		}
406 		rb_link_node(&nskb->rbnode, parent, p);
407 		rb_insert_color(&nskb->rbnode, &q->t_root);
408 	}
409 	sch->q.qlen++;
410 }
411 
412 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
413  * when we statistically choose to corrupt one, we instead segment it, returning
414  * the first packet to be corrupted, and re-enqueue the remaining frames
415  */
416 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
417 				     struct sk_buff **to_free)
418 {
419 	struct sk_buff *segs;
420 	netdev_features_t features = netif_skb_features(skb);
421 
422 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
423 
424 	if (IS_ERR_OR_NULL(segs)) {
425 		qdisc_drop(skb, sch, to_free);
426 		return NULL;
427 	}
428 	consume_skb(skb);
429 	return segs;
430 }
431 
432 /*
433  * Insert one skb into qdisc.
434  * Note: parent depends on return value to account for queue length.
435  * 	NET_XMIT_DROP: queue length didn't change.
436  *      NET_XMIT_SUCCESS: one skb was queued.
437  */
438 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
439 			 struct sk_buff **to_free)
440 {
441 	struct netem_sched_data *q = qdisc_priv(sch);
442 	/* We don't fill cb now as skb_unshare() may invalidate it */
443 	struct netem_skb_cb *cb;
444 	struct sk_buff *skb2;
445 	struct sk_buff *segs = NULL;
446 	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
447 	int nb = 0;
448 	int count = 1;
449 	int rc = NET_XMIT_SUCCESS;
450 	int rc_drop = NET_XMIT_DROP;
451 
452 	/* Do not fool qdisc_drop_all() */
453 	skb->prev = NULL;
454 
455 	/* Random duplication */
456 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
457 		++count;
458 
459 	/* Drop packet? */
460 	if (loss_event(q)) {
461 		if (q->ecn && INET_ECN_set_ce(skb))
462 			qdisc_qstats_drop(sch); /* mark packet */
463 		else
464 			--count;
465 	}
466 	if (count == 0) {
467 		qdisc_qstats_drop(sch);
468 		__qdisc_drop(skb, to_free);
469 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
470 	}
471 
472 	/* If a delay is expected, orphan the skb. (orphaning usually takes
473 	 * place at TX completion time, so _before_ the link transit delay)
474 	 */
475 	if (q->latency || q->jitter || q->rate)
476 		skb_orphan_partial(skb);
477 
478 	/*
479 	 * If we need to duplicate packet, then re-insert at top of the
480 	 * qdisc tree, since parent queuer expects that only one
481 	 * skb will be queued.
482 	 */
483 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
484 		struct Qdisc *rootq = qdisc_root(sch);
485 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
486 
487 		q->duplicate = 0;
488 		rootq->enqueue(skb2, rootq, to_free);
489 		q->duplicate = dupsave;
490 		rc_drop = NET_XMIT_SUCCESS;
491 	}
492 
493 	/*
494 	 * Randomized packet corruption.
495 	 * Make copy if needed since we are modifying
496 	 * If packet is going to be hardware checksummed, then
497 	 * do it now in software before we mangle it.
498 	 */
499 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
500 		if (skb_is_gso(skb)) {
501 			segs = netem_segment(skb, sch, to_free);
502 			if (!segs)
503 				return rc_drop;
504 		} else {
505 			segs = skb;
506 		}
507 
508 		skb = segs;
509 		segs = segs->next;
510 
511 		skb = skb_unshare(skb, GFP_ATOMIC);
512 		if (unlikely(!skb)) {
513 			qdisc_qstats_drop(sch);
514 			goto finish_segs;
515 		}
516 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
517 		    skb_checksum_help(skb)) {
518 			qdisc_drop(skb, sch, to_free);
519 			goto finish_segs;
520 		}
521 
522 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
523 			1<<(prandom_u32() % 8);
524 	}
525 
526 	if (unlikely(sch->q.qlen >= sch->limit)) {
527 		qdisc_drop_all(skb, sch, to_free);
528 		return rc_drop;
529 	}
530 
531 	qdisc_qstats_backlog_inc(sch, skb);
532 
533 	cb = netem_skb_cb(skb);
534 	if (q->gap == 0 ||		/* not doing reordering */
535 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
536 	    q->reorder < get_crandom(&q->reorder_cor)) {
537 		u64 now;
538 		s64 delay;
539 
540 		delay = tabledist(q->latency, q->jitter,
541 				  &q->delay_cor, q->delay_dist);
542 
543 		now = ktime_get_ns();
544 
545 		if (q->rate) {
546 			struct netem_skb_cb *last = NULL;
547 
548 			if (sch->q.tail)
549 				last = netem_skb_cb(sch->q.tail);
550 			if (q->t_root.rb_node) {
551 				struct sk_buff *t_skb;
552 				struct netem_skb_cb *t_last;
553 
554 				t_skb = skb_rb_last(&q->t_root);
555 				t_last = netem_skb_cb(t_skb);
556 				if (!last ||
557 				    t_last->time_to_send > last->time_to_send)
558 					last = t_last;
559 			}
560 			if (q->t_tail) {
561 				struct netem_skb_cb *t_last =
562 					netem_skb_cb(q->t_tail);
563 
564 				if (!last ||
565 				    t_last->time_to_send > last->time_to_send)
566 					last = t_last;
567 			}
568 
569 			if (last) {
570 				/*
571 				 * Last packet in queue is reference point (now),
572 				 * calculate this time bonus and subtract
573 				 * from delay.
574 				 */
575 				delay -= last->time_to_send - now;
576 				delay = max_t(s64, 0, delay);
577 				now = last->time_to_send;
578 			}
579 
580 			delay += packet_time_ns(qdisc_pkt_len(skb), q);
581 		}
582 
583 		cb->time_to_send = now + delay;
584 		++q->counter;
585 		tfifo_enqueue(skb, sch);
586 	} else {
587 		/*
588 		 * Do re-ordering by putting one out of N packets at the front
589 		 * of the queue.
590 		 */
591 		cb->time_to_send = ktime_get_ns();
592 		q->counter = 0;
593 
594 		__qdisc_enqueue_head(skb, &sch->q);
595 		sch->qstats.requeues++;
596 	}
597 
598 finish_segs:
599 	if (segs) {
600 		while (segs) {
601 			skb2 = segs->next;
602 			skb_mark_not_on_list(segs);
603 			qdisc_skb_cb(segs)->pkt_len = segs->len;
604 			last_len = segs->len;
605 			rc = qdisc_enqueue(segs, sch, to_free);
606 			if (rc != NET_XMIT_SUCCESS) {
607 				if (net_xmit_drop_count(rc))
608 					qdisc_qstats_drop(sch);
609 			} else {
610 				nb++;
611 				len += last_len;
612 			}
613 			segs = skb2;
614 		}
615 		sch->q.qlen += nb;
616 		if (nb > 1)
617 			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
618 	}
619 	return NET_XMIT_SUCCESS;
620 }
621 
622 /* Delay the next round with a new future slot with a
623  * correct number of bytes and packets.
624  */
625 
626 static void get_slot_next(struct netem_sched_data *q, u64 now)
627 {
628 	s64 next_delay;
629 
630 	if (!q->slot_dist)
631 		next_delay = q->slot_config.min_delay +
632 				(prandom_u32() *
633 				 (q->slot_config.max_delay -
634 				  q->slot_config.min_delay) >> 32);
635 	else
636 		next_delay = tabledist(q->slot_config.dist_delay,
637 				       (s32)(q->slot_config.dist_jitter),
638 				       NULL, q->slot_dist);
639 
640 	q->slot.slot_next = now + next_delay;
641 	q->slot.packets_left = q->slot_config.max_packets;
642 	q->slot.bytes_left = q->slot_config.max_bytes;
643 }
644 
645 static struct sk_buff *netem_peek(struct netem_sched_data *q)
646 {
647 	struct sk_buff *skb = skb_rb_first(&q->t_root);
648 	u64 t1, t2;
649 
650 	if (!skb)
651 		return q->t_head;
652 	if (!q->t_head)
653 		return skb;
654 
655 	t1 = netem_skb_cb(skb)->time_to_send;
656 	t2 = netem_skb_cb(q->t_head)->time_to_send;
657 	if (t1 < t2)
658 		return skb;
659 	return q->t_head;
660 }
661 
662 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
663 {
664 	if (skb == q->t_head) {
665 		q->t_head = skb->next;
666 		if (!q->t_head)
667 			q->t_tail = NULL;
668 	} else {
669 		rb_erase(&skb->rbnode, &q->t_root);
670 	}
671 }
672 
673 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
674 {
675 	struct netem_sched_data *q = qdisc_priv(sch);
676 	struct sk_buff *skb;
677 
678 tfifo_dequeue:
679 	skb = __qdisc_dequeue_head(&sch->q);
680 	if (skb) {
681 		qdisc_qstats_backlog_dec(sch, skb);
682 deliver:
683 		qdisc_bstats_update(sch, skb);
684 		return skb;
685 	}
686 	skb = netem_peek(q);
687 	if (skb) {
688 		u64 time_to_send;
689 		u64 now = ktime_get_ns();
690 
691 		/* if more time remaining? */
692 		time_to_send = netem_skb_cb(skb)->time_to_send;
693 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
694 			get_slot_next(q, now);
695 
696 		if (time_to_send <= now && q->slot.slot_next <= now) {
697 			netem_erase_head(q, skb);
698 			sch->q.qlen--;
699 			qdisc_qstats_backlog_dec(sch, skb);
700 			skb->next = NULL;
701 			skb->prev = NULL;
702 			/* skb->dev shares skb->rbnode area,
703 			 * we need to restore its value.
704 			 */
705 			skb->dev = qdisc_dev(sch);
706 
707 			if (q->slot.slot_next) {
708 				q->slot.packets_left--;
709 				q->slot.bytes_left -= qdisc_pkt_len(skb);
710 				if (q->slot.packets_left <= 0 ||
711 				    q->slot.bytes_left <= 0)
712 					get_slot_next(q, now);
713 			}
714 
715 			if (q->qdisc) {
716 				unsigned int pkt_len = qdisc_pkt_len(skb);
717 				struct sk_buff *to_free = NULL;
718 				int err;
719 
720 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
721 				kfree_skb_list(to_free);
722 				if (err != NET_XMIT_SUCCESS &&
723 				    net_xmit_drop_count(err)) {
724 					qdisc_qstats_drop(sch);
725 					qdisc_tree_reduce_backlog(sch, 1,
726 								  pkt_len);
727 				}
728 				goto tfifo_dequeue;
729 			}
730 			goto deliver;
731 		}
732 
733 		if (q->qdisc) {
734 			skb = q->qdisc->ops->dequeue(q->qdisc);
735 			if (skb)
736 				goto deliver;
737 		}
738 
739 		qdisc_watchdog_schedule_ns(&q->watchdog,
740 					   max(time_to_send,
741 					       q->slot.slot_next));
742 	}
743 
744 	if (q->qdisc) {
745 		skb = q->qdisc->ops->dequeue(q->qdisc);
746 		if (skb)
747 			goto deliver;
748 	}
749 	return NULL;
750 }
751 
752 static void netem_reset(struct Qdisc *sch)
753 {
754 	struct netem_sched_data *q = qdisc_priv(sch);
755 
756 	qdisc_reset_queue(sch);
757 	tfifo_reset(sch);
758 	if (q->qdisc)
759 		qdisc_reset(q->qdisc);
760 	qdisc_watchdog_cancel(&q->watchdog);
761 }
762 
763 static void dist_free(struct disttable *d)
764 {
765 	kvfree(d);
766 }
767 
768 /*
769  * Distribution data is a variable size payload containing
770  * signed 16 bit values.
771  */
772 
773 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
774 			  const struct nlattr *attr)
775 {
776 	size_t n = nla_len(attr)/sizeof(__s16);
777 	const __s16 *data = nla_data(attr);
778 	spinlock_t *root_lock;
779 	struct disttable *d;
780 	int i;
781 
782 	if (n > NETEM_DIST_MAX)
783 		return -EINVAL;
784 
785 	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
786 	if (!d)
787 		return -ENOMEM;
788 
789 	d->size = n;
790 	for (i = 0; i < n; i++)
791 		d->table[i] = data[i];
792 
793 	root_lock = qdisc_root_sleeping_lock(sch);
794 
795 	spin_lock_bh(root_lock);
796 	swap(*tbl, d);
797 	spin_unlock_bh(root_lock);
798 
799 	dist_free(d);
800 	return 0;
801 }
802 
803 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
804 {
805 	const struct tc_netem_slot *c = nla_data(attr);
806 
807 	q->slot_config = *c;
808 	if (q->slot_config.max_packets == 0)
809 		q->slot_config.max_packets = INT_MAX;
810 	if (q->slot_config.max_bytes == 0)
811 		q->slot_config.max_bytes = INT_MAX;
812 	q->slot.packets_left = q->slot_config.max_packets;
813 	q->slot.bytes_left = q->slot_config.max_bytes;
814 	if (q->slot_config.min_delay | q->slot_config.max_delay |
815 	    q->slot_config.dist_jitter)
816 		q->slot.slot_next = ktime_get_ns();
817 	else
818 		q->slot.slot_next = 0;
819 }
820 
821 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
822 {
823 	const struct tc_netem_corr *c = nla_data(attr);
824 
825 	init_crandom(&q->delay_cor, c->delay_corr);
826 	init_crandom(&q->loss_cor, c->loss_corr);
827 	init_crandom(&q->dup_cor, c->dup_corr);
828 }
829 
830 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
831 {
832 	const struct tc_netem_reorder *r = nla_data(attr);
833 
834 	q->reorder = r->probability;
835 	init_crandom(&q->reorder_cor, r->correlation);
836 }
837 
838 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 	const struct tc_netem_corrupt *r = nla_data(attr);
841 
842 	q->corrupt = r->probability;
843 	init_crandom(&q->corrupt_cor, r->correlation);
844 }
845 
846 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 	const struct tc_netem_rate *r = nla_data(attr);
849 
850 	q->rate = r->rate;
851 	q->packet_overhead = r->packet_overhead;
852 	q->cell_size = r->cell_size;
853 	q->cell_overhead = r->cell_overhead;
854 	if (q->cell_size)
855 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
856 	else
857 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
858 }
859 
860 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
861 {
862 	const struct nlattr *la;
863 	int rem;
864 
865 	nla_for_each_nested(la, attr, rem) {
866 		u16 type = nla_type(la);
867 
868 		switch (type) {
869 		case NETEM_LOSS_GI: {
870 			const struct tc_netem_gimodel *gi = nla_data(la);
871 
872 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
873 				pr_info("netem: incorrect gi model size\n");
874 				return -EINVAL;
875 			}
876 
877 			q->loss_model = CLG_4_STATES;
878 
879 			q->clg.state = TX_IN_GAP_PERIOD;
880 			q->clg.a1 = gi->p13;
881 			q->clg.a2 = gi->p31;
882 			q->clg.a3 = gi->p32;
883 			q->clg.a4 = gi->p14;
884 			q->clg.a5 = gi->p23;
885 			break;
886 		}
887 
888 		case NETEM_LOSS_GE: {
889 			const struct tc_netem_gemodel *ge = nla_data(la);
890 
891 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
892 				pr_info("netem: incorrect ge model size\n");
893 				return -EINVAL;
894 			}
895 
896 			q->loss_model = CLG_GILB_ELL;
897 			q->clg.state = GOOD_STATE;
898 			q->clg.a1 = ge->p;
899 			q->clg.a2 = ge->r;
900 			q->clg.a3 = ge->h;
901 			q->clg.a4 = ge->k1;
902 			break;
903 		}
904 
905 		default:
906 			pr_info("netem: unknown loss type %u\n", type);
907 			return -EINVAL;
908 		}
909 	}
910 
911 	return 0;
912 }
913 
914 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
915 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
916 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
917 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
918 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
919 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
920 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
921 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
922 	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
923 	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
924 	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
925 };
926 
927 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
928 		      const struct nla_policy *policy, int len)
929 {
930 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
931 
932 	if (nested_len < 0) {
933 		pr_info("netem: invalid attributes len %d\n", nested_len);
934 		return -EINVAL;
935 	}
936 
937 	if (nested_len >= nla_attr_size(0))
938 		return nla_parse_deprecated(tb, maxtype,
939 					    nla_data(nla) + NLA_ALIGN(len),
940 					    nested_len, policy, NULL);
941 
942 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
943 	return 0;
944 }
945 
946 /* Parse netlink message to set options */
947 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
948 			struct netlink_ext_ack *extack)
949 {
950 	struct netem_sched_data *q = qdisc_priv(sch);
951 	struct nlattr *tb[TCA_NETEM_MAX + 1];
952 	struct tc_netem_qopt *qopt;
953 	struct clgstate old_clg;
954 	int old_loss_model = CLG_RANDOM;
955 	int ret;
956 
957 	if (opt == NULL)
958 		return -EINVAL;
959 
960 	qopt = nla_data(opt);
961 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
962 	if (ret < 0)
963 		return ret;
964 
965 	/* backup q->clg and q->loss_model */
966 	old_clg = q->clg;
967 	old_loss_model = q->loss_model;
968 
969 	if (tb[TCA_NETEM_LOSS]) {
970 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
971 		if (ret) {
972 			q->loss_model = old_loss_model;
973 			return ret;
974 		}
975 	} else {
976 		q->loss_model = CLG_RANDOM;
977 	}
978 
979 	if (tb[TCA_NETEM_DELAY_DIST]) {
980 		ret = get_dist_table(sch, &q->delay_dist,
981 				     tb[TCA_NETEM_DELAY_DIST]);
982 		if (ret)
983 			goto get_table_failure;
984 	}
985 
986 	if (tb[TCA_NETEM_SLOT_DIST]) {
987 		ret = get_dist_table(sch, &q->slot_dist,
988 				     tb[TCA_NETEM_SLOT_DIST]);
989 		if (ret)
990 			goto get_table_failure;
991 	}
992 
993 	sch->limit = qopt->limit;
994 
995 	q->latency = PSCHED_TICKS2NS(qopt->latency);
996 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
997 	q->limit = qopt->limit;
998 	q->gap = qopt->gap;
999 	q->counter = 0;
1000 	q->loss = qopt->loss;
1001 	q->duplicate = qopt->duplicate;
1002 
1003 	/* for compatibility with earlier versions.
1004 	 * if gap is set, need to assume 100% probability
1005 	 */
1006 	if (q->gap)
1007 		q->reorder = ~0;
1008 
1009 	if (tb[TCA_NETEM_CORR])
1010 		get_correlation(q, tb[TCA_NETEM_CORR]);
1011 
1012 	if (tb[TCA_NETEM_REORDER])
1013 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1014 
1015 	if (tb[TCA_NETEM_CORRUPT])
1016 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1017 
1018 	if (tb[TCA_NETEM_RATE])
1019 		get_rate(q, tb[TCA_NETEM_RATE]);
1020 
1021 	if (tb[TCA_NETEM_RATE64])
1022 		q->rate = max_t(u64, q->rate,
1023 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1024 
1025 	if (tb[TCA_NETEM_LATENCY64])
1026 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1027 
1028 	if (tb[TCA_NETEM_JITTER64])
1029 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1030 
1031 	if (tb[TCA_NETEM_ECN])
1032 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1033 
1034 	if (tb[TCA_NETEM_SLOT])
1035 		get_slot(q, tb[TCA_NETEM_SLOT]);
1036 
1037 	return ret;
1038 
1039 get_table_failure:
1040 	/* recover clg and loss_model, in case of
1041 	 * q->clg and q->loss_model were modified
1042 	 * in get_loss_clg()
1043 	 */
1044 	q->clg = old_clg;
1045 	q->loss_model = old_loss_model;
1046 	return ret;
1047 }
1048 
1049 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1050 		      struct netlink_ext_ack *extack)
1051 {
1052 	struct netem_sched_data *q = qdisc_priv(sch);
1053 	int ret;
1054 
1055 	qdisc_watchdog_init(&q->watchdog, sch);
1056 
1057 	if (!opt)
1058 		return -EINVAL;
1059 
1060 	q->loss_model = CLG_RANDOM;
1061 	ret = netem_change(sch, opt, extack);
1062 	if (ret)
1063 		pr_info("netem: change failed\n");
1064 	return ret;
1065 }
1066 
1067 static void netem_destroy(struct Qdisc *sch)
1068 {
1069 	struct netem_sched_data *q = qdisc_priv(sch);
1070 
1071 	qdisc_watchdog_cancel(&q->watchdog);
1072 	if (q->qdisc)
1073 		qdisc_put(q->qdisc);
1074 	dist_free(q->delay_dist);
1075 	dist_free(q->slot_dist);
1076 }
1077 
1078 static int dump_loss_model(const struct netem_sched_data *q,
1079 			   struct sk_buff *skb)
1080 {
1081 	struct nlattr *nest;
1082 
1083 	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1084 	if (nest == NULL)
1085 		goto nla_put_failure;
1086 
1087 	switch (q->loss_model) {
1088 	case CLG_RANDOM:
1089 		/* legacy loss model */
1090 		nla_nest_cancel(skb, nest);
1091 		return 0;	/* no data */
1092 
1093 	case CLG_4_STATES: {
1094 		struct tc_netem_gimodel gi = {
1095 			.p13 = q->clg.a1,
1096 			.p31 = q->clg.a2,
1097 			.p32 = q->clg.a3,
1098 			.p14 = q->clg.a4,
1099 			.p23 = q->clg.a5,
1100 		};
1101 
1102 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1103 			goto nla_put_failure;
1104 		break;
1105 	}
1106 	case CLG_GILB_ELL: {
1107 		struct tc_netem_gemodel ge = {
1108 			.p = q->clg.a1,
1109 			.r = q->clg.a2,
1110 			.h = q->clg.a3,
1111 			.k1 = q->clg.a4,
1112 		};
1113 
1114 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1115 			goto nla_put_failure;
1116 		break;
1117 	}
1118 	}
1119 
1120 	nla_nest_end(skb, nest);
1121 	return 0;
1122 
1123 nla_put_failure:
1124 	nla_nest_cancel(skb, nest);
1125 	return -1;
1126 }
1127 
1128 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1129 {
1130 	const struct netem_sched_data *q = qdisc_priv(sch);
1131 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1132 	struct tc_netem_qopt qopt;
1133 	struct tc_netem_corr cor;
1134 	struct tc_netem_reorder reorder;
1135 	struct tc_netem_corrupt corrupt;
1136 	struct tc_netem_rate rate;
1137 	struct tc_netem_slot slot;
1138 
1139 	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1140 			     UINT_MAX);
1141 	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1142 			    UINT_MAX);
1143 	qopt.limit = q->limit;
1144 	qopt.loss = q->loss;
1145 	qopt.gap = q->gap;
1146 	qopt.duplicate = q->duplicate;
1147 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1148 		goto nla_put_failure;
1149 
1150 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1151 		goto nla_put_failure;
1152 
1153 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1154 		goto nla_put_failure;
1155 
1156 	cor.delay_corr = q->delay_cor.rho;
1157 	cor.loss_corr = q->loss_cor.rho;
1158 	cor.dup_corr = q->dup_cor.rho;
1159 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1160 		goto nla_put_failure;
1161 
1162 	reorder.probability = q->reorder;
1163 	reorder.correlation = q->reorder_cor.rho;
1164 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1165 		goto nla_put_failure;
1166 
1167 	corrupt.probability = q->corrupt;
1168 	corrupt.correlation = q->corrupt_cor.rho;
1169 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1170 		goto nla_put_failure;
1171 
1172 	if (q->rate >= (1ULL << 32)) {
1173 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1174 				      TCA_NETEM_PAD))
1175 			goto nla_put_failure;
1176 		rate.rate = ~0U;
1177 	} else {
1178 		rate.rate = q->rate;
1179 	}
1180 	rate.packet_overhead = q->packet_overhead;
1181 	rate.cell_size = q->cell_size;
1182 	rate.cell_overhead = q->cell_overhead;
1183 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1184 		goto nla_put_failure;
1185 
1186 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1187 		goto nla_put_failure;
1188 
1189 	if (dump_loss_model(q, skb) != 0)
1190 		goto nla_put_failure;
1191 
1192 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1193 	    q->slot_config.dist_jitter) {
1194 		slot = q->slot_config;
1195 		if (slot.max_packets == INT_MAX)
1196 			slot.max_packets = 0;
1197 		if (slot.max_bytes == INT_MAX)
1198 			slot.max_bytes = 0;
1199 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1200 			goto nla_put_failure;
1201 	}
1202 
1203 	return nla_nest_end(skb, nla);
1204 
1205 nla_put_failure:
1206 	nlmsg_trim(skb, nla);
1207 	return -1;
1208 }
1209 
1210 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1211 			  struct sk_buff *skb, struct tcmsg *tcm)
1212 {
1213 	struct netem_sched_data *q = qdisc_priv(sch);
1214 
1215 	if (cl != 1 || !q->qdisc) 	/* only one class */
1216 		return -ENOENT;
1217 
1218 	tcm->tcm_handle |= TC_H_MIN(1);
1219 	tcm->tcm_info = q->qdisc->handle;
1220 
1221 	return 0;
1222 }
1223 
1224 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1225 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1226 {
1227 	struct netem_sched_data *q = qdisc_priv(sch);
1228 
1229 	*old = qdisc_replace(sch, new, &q->qdisc);
1230 	return 0;
1231 }
1232 
1233 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1234 {
1235 	struct netem_sched_data *q = qdisc_priv(sch);
1236 	return q->qdisc;
1237 }
1238 
1239 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1240 {
1241 	return 1;
1242 }
1243 
1244 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1245 {
1246 	if (!walker->stop) {
1247 		if (walker->count >= walker->skip)
1248 			if (walker->fn(sch, 1, walker) < 0) {
1249 				walker->stop = 1;
1250 				return;
1251 			}
1252 		walker->count++;
1253 	}
1254 }
1255 
1256 static const struct Qdisc_class_ops netem_class_ops = {
1257 	.graft		=	netem_graft,
1258 	.leaf		=	netem_leaf,
1259 	.find		=	netem_find,
1260 	.walk		=	netem_walk,
1261 	.dump		=	netem_dump_class,
1262 };
1263 
1264 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1265 	.id		=	"netem",
1266 	.cl_ops		=	&netem_class_ops,
1267 	.priv_size	=	sizeof(struct netem_sched_data),
1268 	.enqueue	=	netem_enqueue,
1269 	.dequeue	=	netem_dequeue,
1270 	.peek		=	qdisc_peek_dequeued,
1271 	.init		=	netem_init,
1272 	.reset		=	netem_reset,
1273 	.destroy	=	netem_destroy,
1274 	.change		=	netem_change,
1275 	.dump		=	netem_dump,
1276 	.owner		=	THIS_MODULE,
1277 };
1278 
1279 
1280 static int __init netem_module_init(void)
1281 {
1282 	pr_info("netem: version " VERSION "\n");
1283 	return register_qdisc(&netem_qdisc_ops);
1284 }
1285 static void __exit netem_module_exit(void)
1286 {
1287 	unregister_qdisc(&netem_qdisc_ops);
1288 }
1289 module_init(netem_module_init)
1290 module_exit(netem_module_exit)
1291 MODULE_LICENSE("GPL");
1292