xref: /openbmc/linux/net/sched/sch_netem.c (revision 62e59c4e)
1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification that can be handled in
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to the
59 	"Gilbert-Elliot" model, a 4-state markov model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
70 
71 struct disttable {
72 	u32  size;
73 	s16 table[0];
74 };
75 
76 struct netem_sched_data {
77 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
78 	struct rb_root t_root;
79 
80 	/* a linear queue; reduces rbtree rebalancing when jitter is low */
81 	struct sk_buff	*t_head;
82 	struct sk_buff	*t_tail;
83 
84 	/* optional qdisc for classful handling (NULL at netem init) */
85 	struct Qdisc	*qdisc;
86 
87 	struct qdisc_watchdog watchdog;
88 
89 	s64 latency;
90 	s64 jitter;
91 
92 	u32 loss;
93 	u32 ecn;
94 	u32 limit;
95 	u32 counter;
96 	u32 gap;
97 	u32 duplicate;
98 	u32 reorder;
99 	u32 corrupt;
100 	u64 rate;
101 	s32 packet_overhead;
102 	u32 cell_size;
103 	struct reciprocal_value cell_size_reciprocal;
104 	s32 cell_overhead;
105 
106 	struct crndstate {
107 		u32 last;
108 		u32 rho;
109 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
110 
111 	struct disttable *delay_dist;
112 
113 	enum  {
114 		CLG_RANDOM,
115 		CLG_4_STATES,
116 		CLG_GILB_ELL,
117 	} loss_model;
118 
119 	enum {
120 		TX_IN_GAP_PERIOD = 1,
121 		TX_IN_BURST_PERIOD,
122 		LOST_IN_GAP_PERIOD,
123 		LOST_IN_BURST_PERIOD,
124 	} _4_state_model;
125 
126 	enum {
127 		GOOD_STATE = 1,
128 		BAD_STATE,
129 	} GE_state_model;
130 
131 	/* Correlated Loss Generation models */
132 	struct clgstate {
133 		/* state of the Markov chain */
134 		u8 state;
135 
136 		/* 4-states and Gilbert-Elliot models */
137 		u32 a1;	/* p13 for 4-states or p for GE */
138 		u32 a2;	/* p31 for 4-states or r for GE */
139 		u32 a3;	/* p32 for 4-states or h for GE */
140 		u32 a4;	/* p14 for 4-states or 1-k for GE */
141 		u32 a5; /* p23 used only in 4-states */
142 	} clg;
143 
144 	struct tc_netem_slot slot_config;
145 	struct slotstate {
146 		u64 slot_next;
147 		s32 packets_left;
148 		s32 bytes_left;
149 	} slot;
150 
151 	struct disttable *slot_dist;
152 };
153 
154 /* Time stamp put into socket buffer control block
155  * Only valid when skbs are in our internal t(ime)fifo queue.
156  *
157  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
158  * and skb->next & skb->prev are scratch space for a qdisc,
159  * we save skb->tstamp value in skb->cb[] before destroying it.
160  */
161 struct netem_skb_cb {
162 	u64	        time_to_send;
163 };
164 
165 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
166 {
167 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
168 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
169 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
170 }
171 
172 /* init_crandom - initialize correlated random number generator
173  * Use entropy source for initial seed.
174  */
175 static void init_crandom(struct crndstate *state, unsigned long rho)
176 {
177 	state->rho = rho;
178 	state->last = prandom_u32();
179 }
180 
181 /* get_crandom - correlated random number generator
182  * Next number depends on last value.
183  * rho is scaled to avoid floating point.
184  */
185 static u32 get_crandom(struct crndstate *state)
186 {
187 	u64 value, rho;
188 	unsigned long answer;
189 
190 	if (!state || state->rho == 0)	/* no correlation */
191 		return prandom_u32();
192 
193 	value = prandom_u32();
194 	rho = (u64)state->rho + 1;
195 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
196 	state->last = answer;
197 	return answer;
198 }
199 
200 /* loss_4state - 4-state model loss generator
201  * Generates losses according to the 4-state Markov chain adopted in
202  * the GI (General and Intuitive) loss model.
203  */
204 static bool loss_4state(struct netem_sched_data *q)
205 {
206 	struct clgstate *clg = &q->clg;
207 	u32 rnd = prandom_u32();
208 
209 	/*
210 	 * Makes a comparison between rnd and the transition
211 	 * probabilities outgoing from the current state, then decides the
212 	 * next state and if the next packet has to be transmitted or lost.
213 	 * The four states correspond to:
214 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
215 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
216 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
217 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
218 	 */
219 	switch (clg->state) {
220 	case TX_IN_GAP_PERIOD:
221 		if (rnd < clg->a4) {
222 			clg->state = LOST_IN_BURST_PERIOD;
223 			return true;
224 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
225 			clg->state = LOST_IN_GAP_PERIOD;
226 			return true;
227 		} else if (clg->a1 + clg->a4 < rnd) {
228 			clg->state = TX_IN_GAP_PERIOD;
229 		}
230 
231 		break;
232 	case TX_IN_BURST_PERIOD:
233 		if (rnd < clg->a5) {
234 			clg->state = LOST_IN_GAP_PERIOD;
235 			return true;
236 		} else {
237 			clg->state = TX_IN_BURST_PERIOD;
238 		}
239 
240 		break;
241 	case LOST_IN_GAP_PERIOD:
242 		if (rnd < clg->a3)
243 			clg->state = TX_IN_BURST_PERIOD;
244 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
245 			clg->state = TX_IN_GAP_PERIOD;
246 		} else if (clg->a2 + clg->a3 < rnd) {
247 			clg->state = LOST_IN_GAP_PERIOD;
248 			return true;
249 		}
250 		break;
251 	case LOST_IN_BURST_PERIOD:
252 		clg->state = TX_IN_GAP_PERIOD;
253 		break;
254 	}
255 
256 	return false;
257 }
258 
259 /* loss_gilb_ell - Gilbert-Elliot model loss generator
260  * Generates losses according to the Gilbert-Elliot loss model or
261  * its special cases  (Gilbert or Simple Gilbert)
262  *
263  * Makes a comparison between random number and the transition
264  * probabilities outgoing from the current state, then decides the
265  * next state. A second random number is extracted and the comparison
266  * with the loss probability of the current state decides if the next
267  * packet will be transmitted or lost.
268  */
269 static bool loss_gilb_ell(struct netem_sched_data *q)
270 {
271 	struct clgstate *clg = &q->clg;
272 
273 	switch (clg->state) {
274 	case GOOD_STATE:
275 		if (prandom_u32() < clg->a1)
276 			clg->state = BAD_STATE;
277 		if (prandom_u32() < clg->a4)
278 			return true;
279 		break;
280 	case BAD_STATE:
281 		if (prandom_u32() < clg->a2)
282 			clg->state = GOOD_STATE;
283 		if (prandom_u32() > clg->a3)
284 			return true;
285 	}
286 
287 	return false;
288 }
289 
290 static bool loss_event(struct netem_sched_data *q)
291 {
292 	switch (q->loss_model) {
293 	case CLG_RANDOM:
294 		/* Random packet drop 0 => none, ~0 => all */
295 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
296 
297 	case CLG_4_STATES:
298 		/* 4state loss model algorithm (used also for GI model)
299 		* Extracts a value from the markov 4 state loss generator,
300 		* if it is 1 drops a packet and if needed writes the event in
301 		* the kernel logs
302 		*/
303 		return loss_4state(q);
304 
305 	case CLG_GILB_ELL:
306 		/* Gilbert-Elliot loss model algorithm
307 		* Extracts a value from the Gilbert-Elliot loss generator,
308 		* if it is 1 drops a packet and if needed writes the event in
309 		* the kernel logs
310 		*/
311 		return loss_gilb_ell(q);
312 	}
313 
314 	return false;	/* not reached */
315 }
316 
317 
318 /* tabledist - return a pseudo-randomly distributed value with mean mu and
319  * std deviation sigma.  Uses table lookup to approximate the desired
320  * distribution, and a uniformly-distributed pseudo-random source.
321  */
322 static s64 tabledist(s64 mu, s32 sigma,
323 		     struct crndstate *state,
324 		     const struct disttable *dist)
325 {
326 	s64 x;
327 	long t;
328 	u32 rnd;
329 
330 	if (sigma == 0)
331 		return mu;
332 
333 	rnd = get_crandom(state);
334 
335 	/* default uniform distribution */
336 	if (dist == NULL)
337 		return ((rnd % (2 * sigma)) + mu) - sigma;
338 
339 	t = dist->table[rnd % dist->size];
340 	x = (sigma % NETEM_DIST_SCALE) * t;
341 	if (x >= 0)
342 		x += NETEM_DIST_SCALE/2;
343 	else
344 		x -= NETEM_DIST_SCALE/2;
345 
346 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
347 }
348 
349 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
350 {
351 	len += q->packet_overhead;
352 
353 	if (q->cell_size) {
354 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
355 
356 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
357 			cells++;
358 		len = cells * (q->cell_size + q->cell_overhead);
359 	}
360 
361 	return div64_u64(len * NSEC_PER_SEC, q->rate);
362 }
363 
364 static void tfifo_reset(struct Qdisc *sch)
365 {
366 	struct netem_sched_data *q = qdisc_priv(sch);
367 	struct rb_node *p = rb_first(&q->t_root);
368 
369 	while (p) {
370 		struct sk_buff *skb = rb_to_skb(p);
371 
372 		p = rb_next(p);
373 		rb_erase(&skb->rbnode, &q->t_root);
374 		rtnl_kfree_skbs(skb, skb);
375 	}
376 
377 	rtnl_kfree_skbs(q->t_head, q->t_tail);
378 	q->t_head = NULL;
379 	q->t_tail = NULL;
380 }
381 
382 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
383 {
384 	struct netem_sched_data *q = qdisc_priv(sch);
385 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
386 
387 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
388 		if (q->t_tail)
389 			q->t_tail->next = nskb;
390 		else
391 			q->t_head = nskb;
392 		q->t_tail = nskb;
393 	} else {
394 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
395 
396 		while (*p) {
397 			struct sk_buff *skb;
398 
399 			parent = *p;
400 			skb = rb_to_skb(parent);
401 			if (tnext >= netem_skb_cb(skb)->time_to_send)
402 				p = &parent->rb_right;
403 			else
404 				p = &parent->rb_left;
405 		}
406 		rb_link_node(&nskb->rbnode, parent, p);
407 		rb_insert_color(&nskb->rbnode, &q->t_root);
408 	}
409 	sch->q.qlen++;
410 }
411 
412 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
413  * when we statistically choose to corrupt one, we instead segment it, returning
414  * the first packet to be corrupted, and re-enqueue the remaining frames
415  */
416 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
417 				     struct sk_buff **to_free)
418 {
419 	struct sk_buff *segs;
420 	netdev_features_t features = netif_skb_features(skb);
421 
422 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
423 
424 	if (IS_ERR_OR_NULL(segs)) {
425 		qdisc_drop(skb, sch, to_free);
426 		return NULL;
427 	}
428 	consume_skb(skb);
429 	return segs;
430 }
431 
432 /*
433  * Insert one skb into qdisc.
434  * Note: parent depends on return value to account for queue length.
435  * 	NET_XMIT_DROP: queue length didn't change.
436  *      NET_XMIT_SUCCESS: one skb was queued.
437  */
438 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
439 			 struct sk_buff **to_free)
440 {
441 	struct netem_sched_data *q = qdisc_priv(sch);
442 	/* We don't fill cb now as skb_unshare() may invalidate it */
443 	struct netem_skb_cb *cb;
444 	struct sk_buff *skb2;
445 	struct sk_buff *segs = NULL;
446 	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
447 	int nb = 0;
448 	int count = 1;
449 	int rc = NET_XMIT_SUCCESS;
450 	int rc_drop = NET_XMIT_DROP;
451 
452 	/* Do not fool qdisc_drop_all() */
453 	skb->prev = NULL;
454 
455 	/* Random duplication */
456 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
457 		++count;
458 
459 	/* Drop packet? */
460 	if (loss_event(q)) {
461 		if (q->ecn && INET_ECN_set_ce(skb))
462 			qdisc_qstats_drop(sch); /* mark packet */
463 		else
464 			--count;
465 	}
466 	if (count == 0) {
467 		qdisc_qstats_drop(sch);
468 		__qdisc_drop(skb, to_free);
469 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
470 	}
471 
472 	/* If a delay is expected, orphan the skb. (orphaning usually takes
473 	 * place at TX completion time, so _before_ the link transit delay)
474 	 */
475 	if (q->latency || q->jitter || q->rate)
476 		skb_orphan_partial(skb);
477 
478 	/*
479 	 * If we need to duplicate packet, then re-insert at top of the
480 	 * qdisc tree, since parent queuer expects that only one
481 	 * skb will be queued.
482 	 */
483 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
484 		struct Qdisc *rootq = qdisc_root(sch);
485 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
486 
487 		q->duplicate = 0;
488 		rootq->enqueue(skb2, rootq, to_free);
489 		q->duplicate = dupsave;
490 		rc_drop = NET_XMIT_SUCCESS;
491 	}
492 
493 	/*
494 	 * Randomized packet corruption.
495 	 * Make copy if needed since we are modifying
496 	 * If packet is going to be hardware checksummed, then
497 	 * do it now in software before we mangle it.
498 	 */
499 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
500 		if (skb_is_gso(skb)) {
501 			segs = netem_segment(skb, sch, to_free);
502 			if (!segs)
503 				return rc_drop;
504 		} else {
505 			segs = skb;
506 		}
507 
508 		skb = segs;
509 		segs = segs->next;
510 
511 		skb = skb_unshare(skb, GFP_ATOMIC);
512 		if (unlikely(!skb)) {
513 			qdisc_qstats_drop(sch);
514 			goto finish_segs;
515 		}
516 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
517 		    skb_checksum_help(skb)) {
518 			qdisc_drop(skb, sch, to_free);
519 			goto finish_segs;
520 		}
521 
522 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
523 			1<<(prandom_u32() % 8);
524 	}
525 
526 	if (unlikely(sch->q.qlen >= sch->limit)) {
527 		qdisc_drop_all(skb, sch, to_free);
528 		return rc_drop;
529 	}
530 
531 	qdisc_qstats_backlog_inc(sch, skb);
532 
533 	cb = netem_skb_cb(skb);
534 	if (q->gap == 0 ||		/* not doing reordering */
535 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
536 	    q->reorder < get_crandom(&q->reorder_cor)) {
537 		u64 now;
538 		s64 delay;
539 
540 		delay = tabledist(q->latency, q->jitter,
541 				  &q->delay_cor, q->delay_dist);
542 
543 		now = ktime_get_ns();
544 
545 		if (q->rate) {
546 			struct netem_skb_cb *last = NULL;
547 
548 			if (sch->q.tail)
549 				last = netem_skb_cb(sch->q.tail);
550 			if (q->t_root.rb_node) {
551 				struct sk_buff *t_skb;
552 				struct netem_skb_cb *t_last;
553 
554 				t_skb = skb_rb_last(&q->t_root);
555 				t_last = netem_skb_cb(t_skb);
556 				if (!last ||
557 				    t_last->time_to_send > last->time_to_send)
558 					last = t_last;
559 			}
560 			if (q->t_tail) {
561 				struct netem_skb_cb *t_last =
562 					netem_skb_cb(q->t_tail);
563 
564 				if (!last ||
565 				    t_last->time_to_send > last->time_to_send)
566 					last = t_last;
567 			}
568 
569 			if (last) {
570 				/*
571 				 * Last packet in queue is reference point (now),
572 				 * calculate this time bonus and subtract
573 				 * from delay.
574 				 */
575 				delay -= last->time_to_send - now;
576 				delay = max_t(s64, 0, delay);
577 				now = last->time_to_send;
578 			}
579 
580 			delay += packet_time_ns(qdisc_pkt_len(skb), q);
581 		}
582 
583 		cb->time_to_send = now + delay;
584 		++q->counter;
585 		tfifo_enqueue(skb, sch);
586 	} else {
587 		/*
588 		 * Do re-ordering by putting one out of N packets at the front
589 		 * of the queue.
590 		 */
591 		cb->time_to_send = ktime_get_ns();
592 		q->counter = 0;
593 
594 		__qdisc_enqueue_head(skb, &sch->q);
595 		sch->qstats.requeues++;
596 	}
597 
598 finish_segs:
599 	if (segs) {
600 		while (segs) {
601 			skb2 = segs->next;
602 			skb_mark_not_on_list(segs);
603 			qdisc_skb_cb(segs)->pkt_len = segs->len;
604 			last_len = segs->len;
605 			rc = qdisc_enqueue(segs, sch, to_free);
606 			if (rc != NET_XMIT_SUCCESS) {
607 				if (net_xmit_drop_count(rc))
608 					qdisc_qstats_drop(sch);
609 			} else {
610 				nb++;
611 				len += last_len;
612 			}
613 			segs = skb2;
614 		}
615 		sch->q.qlen += nb;
616 		if (nb > 1)
617 			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
618 	}
619 	return NET_XMIT_SUCCESS;
620 }
621 
622 /* Delay the next round with a new future slot with a
623  * correct number of bytes and packets.
624  */
625 
626 static void get_slot_next(struct netem_sched_data *q, u64 now)
627 {
628 	s64 next_delay;
629 
630 	if (!q->slot_dist)
631 		next_delay = q->slot_config.min_delay +
632 				(prandom_u32() *
633 				 (q->slot_config.max_delay -
634 				  q->slot_config.min_delay) >> 32);
635 	else
636 		next_delay = tabledist(q->slot_config.dist_delay,
637 				       (s32)(q->slot_config.dist_jitter),
638 				       NULL, q->slot_dist);
639 
640 	q->slot.slot_next = now + next_delay;
641 	q->slot.packets_left = q->slot_config.max_packets;
642 	q->slot.bytes_left = q->slot_config.max_bytes;
643 }
644 
645 static struct sk_buff *netem_peek(struct netem_sched_data *q)
646 {
647 	struct sk_buff *skb = skb_rb_first(&q->t_root);
648 	u64 t1, t2;
649 
650 	if (!skb)
651 		return q->t_head;
652 	if (!q->t_head)
653 		return skb;
654 
655 	t1 = netem_skb_cb(skb)->time_to_send;
656 	t2 = netem_skb_cb(q->t_head)->time_to_send;
657 	if (t1 < t2)
658 		return skb;
659 	return q->t_head;
660 }
661 
662 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
663 {
664 	if (skb == q->t_head) {
665 		q->t_head = skb->next;
666 		if (!q->t_head)
667 			q->t_tail = NULL;
668 	} else {
669 		rb_erase(&skb->rbnode, &q->t_root);
670 	}
671 }
672 
673 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
674 {
675 	struct netem_sched_data *q = qdisc_priv(sch);
676 	struct sk_buff *skb;
677 
678 tfifo_dequeue:
679 	skb = __qdisc_dequeue_head(&sch->q);
680 	if (skb) {
681 		qdisc_qstats_backlog_dec(sch, skb);
682 deliver:
683 		qdisc_bstats_update(sch, skb);
684 		return skb;
685 	}
686 	skb = netem_peek(q);
687 	if (skb) {
688 		u64 time_to_send;
689 		u64 now = ktime_get_ns();
690 
691 		/* if more time remaining? */
692 		time_to_send = netem_skb_cb(skb)->time_to_send;
693 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
694 			get_slot_next(q, now);
695 
696 		if (time_to_send <= now && q->slot.slot_next <= now) {
697 			netem_erase_head(q, skb);
698 			sch->q.qlen--;
699 			qdisc_qstats_backlog_dec(sch, skb);
700 			skb->next = NULL;
701 			skb->prev = NULL;
702 			/* skb->dev shares skb->rbnode area,
703 			 * we need to restore its value.
704 			 */
705 			skb->dev = qdisc_dev(sch);
706 
707 			if (q->slot.slot_next) {
708 				q->slot.packets_left--;
709 				q->slot.bytes_left -= qdisc_pkt_len(skb);
710 				if (q->slot.packets_left <= 0 ||
711 				    q->slot.bytes_left <= 0)
712 					get_slot_next(q, now);
713 			}
714 
715 			if (q->qdisc) {
716 				unsigned int pkt_len = qdisc_pkt_len(skb);
717 				struct sk_buff *to_free = NULL;
718 				int err;
719 
720 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
721 				kfree_skb_list(to_free);
722 				if (err != NET_XMIT_SUCCESS &&
723 				    net_xmit_drop_count(err)) {
724 					qdisc_qstats_drop(sch);
725 					qdisc_tree_reduce_backlog(sch, 1,
726 								  pkt_len);
727 				}
728 				goto tfifo_dequeue;
729 			}
730 			goto deliver;
731 		}
732 
733 		if (q->qdisc) {
734 			skb = q->qdisc->ops->dequeue(q->qdisc);
735 			if (skb)
736 				goto deliver;
737 		}
738 
739 		qdisc_watchdog_schedule_ns(&q->watchdog,
740 					   max(time_to_send,
741 					       q->slot.slot_next));
742 	}
743 
744 	if (q->qdisc) {
745 		skb = q->qdisc->ops->dequeue(q->qdisc);
746 		if (skb)
747 			goto deliver;
748 	}
749 	return NULL;
750 }
751 
752 static void netem_reset(struct Qdisc *sch)
753 {
754 	struct netem_sched_data *q = qdisc_priv(sch);
755 
756 	qdisc_reset_queue(sch);
757 	tfifo_reset(sch);
758 	if (q->qdisc)
759 		qdisc_reset(q->qdisc);
760 	qdisc_watchdog_cancel(&q->watchdog);
761 }
762 
763 static void dist_free(struct disttable *d)
764 {
765 	kvfree(d);
766 }
767 
768 /*
769  * Distribution data is a variable size payload containing
770  * signed 16 bit values.
771  */
772 
773 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
774 			  const struct nlattr *attr)
775 {
776 	size_t n = nla_len(attr)/sizeof(__s16);
777 	const __s16 *data = nla_data(attr);
778 	spinlock_t *root_lock;
779 	struct disttable *d;
780 	int i;
781 
782 	if (n > NETEM_DIST_MAX)
783 		return -EINVAL;
784 
785 	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
786 	if (!d)
787 		return -ENOMEM;
788 
789 	d->size = n;
790 	for (i = 0; i < n; i++)
791 		d->table[i] = data[i];
792 
793 	root_lock = qdisc_root_sleeping_lock(sch);
794 
795 	spin_lock_bh(root_lock);
796 	swap(*tbl, d);
797 	spin_unlock_bh(root_lock);
798 
799 	dist_free(d);
800 	return 0;
801 }
802 
803 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
804 {
805 	const struct tc_netem_slot *c = nla_data(attr);
806 
807 	q->slot_config = *c;
808 	if (q->slot_config.max_packets == 0)
809 		q->slot_config.max_packets = INT_MAX;
810 	if (q->slot_config.max_bytes == 0)
811 		q->slot_config.max_bytes = INT_MAX;
812 	q->slot.packets_left = q->slot_config.max_packets;
813 	q->slot.bytes_left = q->slot_config.max_bytes;
814 	if (q->slot_config.min_delay | q->slot_config.max_delay |
815 	    q->slot_config.dist_jitter)
816 		q->slot.slot_next = ktime_get_ns();
817 	else
818 		q->slot.slot_next = 0;
819 }
820 
821 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
822 {
823 	const struct tc_netem_corr *c = nla_data(attr);
824 
825 	init_crandom(&q->delay_cor, c->delay_corr);
826 	init_crandom(&q->loss_cor, c->loss_corr);
827 	init_crandom(&q->dup_cor, c->dup_corr);
828 }
829 
830 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
831 {
832 	const struct tc_netem_reorder *r = nla_data(attr);
833 
834 	q->reorder = r->probability;
835 	init_crandom(&q->reorder_cor, r->correlation);
836 }
837 
838 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 	const struct tc_netem_corrupt *r = nla_data(attr);
841 
842 	q->corrupt = r->probability;
843 	init_crandom(&q->corrupt_cor, r->correlation);
844 }
845 
846 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 	const struct tc_netem_rate *r = nla_data(attr);
849 
850 	q->rate = r->rate;
851 	q->packet_overhead = r->packet_overhead;
852 	q->cell_size = r->cell_size;
853 	q->cell_overhead = r->cell_overhead;
854 	if (q->cell_size)
855 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
856 	else
857 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
858 }
859 
860 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
861 {
862 	const struct nlattr *la;
863 	int rem;
864 
865 	nla_for_each_nested(la, attr, rem) {
866 		u16 type = nla_type(la);
867 
868 		switch (type) {
869 		case NETEM_LOSS_GI: {
870 			const struct tc_netem_gimodel *gi = nla_data(la);
871 
872 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
873 				pr_info("netem: incorrect gi model size\n");
874 				return -EINVAL;
875 			}
876 
877 			q->loss_model = CLG_4_STATES;
878 
879 			q->clg.state = TX_IN_GAP_PERIOD;
880 			q->clg.a1 = gi->p13;
881 			q->clg.a2 = gi->p31;
882 			q->clg.a3 = gi->p32;
883 			q->clg.a4 = gi->p14;
884 			q->clg.a5 = gi->p23;
885 			break;
886 		}
887 
888 		case NETEM_LOSS_GE: {
889 			const struct tc_netem_gemodel *ge = nla_data(la);
890 
891 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
892 				pr_info("netem: incorrect ge model size\n");
893 				return -EINVAL;
894 			}
895 
896 			q->loss_model = CLG_GILB_ELL;
897 			q->clg.state = GOOD_STATE;
898 			q->clg.a1 = ge->p;
899 			q->clg.a2 = ge->r;
900 			q->clg.a3 = ge->h;
901 			q->clg.a4 = ge->k1;
902 			break;
903 		}
904 
905 		default:
906 			pr_info("netem: unknown loss type %u\n", type);
907 			return -EINVAL;
908 		}
909 	}
910 
911 	return 0;
912 }
913 
914 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
915 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
916 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
917 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
918 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
919 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
920 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
921 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
922 	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
923 	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
924 	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
925 };
926 
927 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
928 		      const struct nla_policy *policy, int len)
929 {
930 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
931 
932 	if (nested_len < 0) {
933 		pr_info("netem: invalid attributes len %d\n", nested_len);
934 		return -EINVAL;
935 	}
936 
937 	if (nested_len >= nla_attr_size(0))
938 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
939 				 nested_len, policy, NULL);
940 
941 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
942 	return 0;
943 }
944 
945 /* Parse netlink message to set options */
946 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
947 			struct netlink_ext_ack *extack)
948 {
949 	struct netem_sched_data *q = qdisc_priv(sch);
950 	struct nlattr *tb[TCA_NETEM_MAX + 1];
951 	struct tc_netem_qopt *qopt;
952 	struct clgstate old_clg;
953 	int old_loss_model = CLG_RANDOM;
954 	int ret;
955 
956 	if (opt == NULL)
957 		return -EINVAL;
958 
959 	qopt = nla_data(opt);
960 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
961 	if (ret < 0)
962 		return ret;
963 
964 	/* backup q->clg and q->loss_model */
965 	old_clg = q->clg;
966 	old_loss_model = q->loss_model;
967 
968 	if (tb[TCA_NETEM_LOSS]) {
969 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
970 		if (ret) {
971 			q->loss_model = old_loss_model;
972 			return ret;
973 		}
974 	} else {
975 		q->loss_model = CLG_RANDOM;
976 	}
977 
978 	if (tb[TCA_NETEM_DELAY_DIST]) {
979 		ret = get_dist_table(sch, &q->delay_dist,
980 				     tb[TCA_NETEM_DELAY_DIST]);
981 		if (ret)
982 			goto get_table_failure;
983 	}
984 
985 	if (tb[TCA_NETEM_SLOT_DIST]) {
986 		ret = get_dist_table(sch, &q->slot_dist,
987 				     tb[TCA_NETEM_SLOT_DIST]);
988 		if (ret)
989 			goto get_table_failure;
990 	}
991 
992 	sch->limit = qopt->limit;
993 
994 	q->latency = PSCHED_TICKS2NS(qopt->latency);
995 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
996 	q->limit = qopt->limit;
997 	q->gap = qopt->gap;
998 	q->counter = 0;
999 	q->loss = qopt->loss;
1000 	q->duplicate = qopt->duplicate;
1001 
1002 	/* for compatibility with earlier versions.
1003 	 * if gap is set, need to assume 100% probability
1004 	 */
1005 	if (q->gap)
1006 		q->reorder = ~0;
1007 
1008 	if (tb[TCA_NETEM_CORR])
1009 		get_correlation(q, tb[TCA_NETEM_CORR]);
1010 
1011 	if (tb[TCA_NETEM_REORDER])
1012 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1013 
1014 	if (tb[TCA_NETEM_CORRUPT])
1015 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1016 
1017 	if (tb[TCA_NETEM_RATE])
1018 		get_rate(q, tb[TCA_NETEM_RATE]);
1019 
1020 	if (tb[TCA_NETEM_RATE64])
1021 		q->rate = max_t(u64, q->rate,
1022 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1023 
1024 	if (tb[TCA_NETEM_LATENCY64])
1025 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1026 
1027 	if (tb[TCA_NETEM_JITTER64])
1028 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1029 
1030 	if (tb[TCA_NETEM_ECN])
1031 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1032 
1033 	if (tb[TCA_NETEM_SLOT])
1034 		get_slot(q, tb[TCA_NETEM_SLOT]);
1035 
1036 	return ret;
1037 
1038 get_table_failure:
1039 	/* recover clg and loss_model, in case of
1040 	 * q->clg and q->loss_model were modified
1041 	 * in get_loss_clg()
1042 	 */
1043 	q->clg = old_clg;
1044 	q->loss_model = old_loss_model;
1045 	return ret;
1046 }
1047 
1048 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1049 		      struct netlink_ext_ack *extack)
1050 {
1051 	struct netem_sched_data *q = qdisc_priv(sch);
1052 	int ret;
1053 
1054 	qdisc_watchdog_init(&q->watchdog, sch);
1055 
1056 	if (!opt)
1057 		return -EINVAL;
1058 
1059 	q->loss_model = CLG_RANDOM;
1060 	ret = netem_change(sch, opt, extack);
1061 	if (ret)
1062 		pr_info("netem: change failed\n");
1063 	return ret;
1064 }
1065 
1066 static void netem_destroy(struct Qdisc *sch)
1067 {
1068 	struct netem_sched_data *q = qdisc_priv(sch);
1069 
1070 	qdisc_watchdog_cancel(&q->watchdog);
1071 	if (q->qdisc)
1072 		qdisc_put(q->qdisc);
1073 	dist_free(q->delay_dist);
1074 	dist_free(q->slot_dist);
1075 }
1076 
1077 static int dump_loss_model(const struct netem_sched_data *q,
1078 			   struct sk_buff *skb)
1079 {
1080 	struct nlattr *nest;
1081 
1082 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
1083 	if (nest == NULL)
1084 		goto nla_put_failure;
1085 
1086 	switch (q->loss_model) {
1087 	case CLG_RANDOM:
1088 		/* legacy loss model */
1089 		nla_nest_cancel(skb, nest);
1090 		return 0;	/* no data */
1091 
1092 	case CLG_4_STATES: {
1093 		struct tc_netem_gimodel gi = {
1094 			.p13 = q->clg.a1,
1095 			.p31 = q->clg.a2,
1096 			.p32 = q->clg.a3,
1097 			.p14 = q->clg.a4,
1098 			.p23 = q->clg.a5,
1099 		};
1100 
1101 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1102 			goto nla_put_failure;
1103 		break;
1104 	}
1105 	case CLG_GILB_ELL: {
1106 		struct tc_netem_gemodel ge = {
1107 			.p = q->clg.a1,
1108 			.r = q->clg.a2,
1109 			.h = q->clg.a3,
1110 			.k1 = q->clg.a4,
1111 		};
1112 
1113 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1114 			goto nla_put_failure;
1115 		break;
1116 	}
1117 	}
1118 
1119 	nla_nest_end(skb, nest);
1120 	return 0;
1121 
1122 nla_put_failure:
1123 	nla_nest_cancel(skb, nest);
1124 	return -1;
1125 }
1126 
1127 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1128 {
1129 	const struct netem_sched_data *q = qdisc_priv(sch);
1130 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1131 	struct tc_netem_qopt qopt;
1132 	struct tc_netem_corr cor;
1133 	struct tc_netem_reorder reorder;
1134 	struct tc_netem_corrupt corrupt;
1135 	struct tc_netem_rate rate;
1136 	struct tc_netem_slot slot;
1137 
1138 	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1139 			     UINT_MAX);
1140 	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1141 			    UINT_MAX);
1142 	qopt.limit = q->limit;
1143 	qopt.loss = q->loss;
1144 	qopt.gap = q->gap;
1145 	qopt.duplicate = q->duplicate;
1146 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1147 		goto nla_put_failure;
1148 
1149 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1150 		goto nla_put_failure;
1151 
1152 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1153 		goto nla_put_failure;
1154 
1155 	cor.delay_corr = q->delay_cor.rho;
1156 	cor.loss_corr = q->loss_cor.rho;
1157 	cor.dup_corr = q->dup_cor.rho;
1158 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1159 		goto nla_put_failure;
1160 
1161 	reorder.probability = q->reorder;
1162 	reorder.correlation = q->reorder_cor.rho;
1163 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1164 		goto nla_put_failure;
1165 
1166 	corrupt.probability = q->corrupt;
1167 	corrupt.correlation = q->corrupt_cor.rho;
1168 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1169 		goto nla_put_failure;
1170 
1171 	if (q->rate >= (1ULL << 32)) {
1172 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1173 				      TCA_NETEM_PAD))
1174 			goto nla_put_failure;
1175 		rate.rate = ~0U;
1176 	} else {
1177 		rate.rate = q->rate;
1178 	}
1179 	rate.packet_overhead = q->packet_overhead;
1180 	rate.cell_size = q->cell_size;
1181 	rate.cell_overhead = q->cell_overhead;
1182 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1183 		goto nla_put_failure;
1184 
1185 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1186 		goto nla_put_failure;
1187 
1188 	if (dump_loss_model(q, skb) != 0)
1189 		goto nla_put_failure;
1190 
1191 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1192 	    q->slot_config.dist_jitter) {
1193 		slot = q->slot_config;
1194 		if (slot.max_packets == INT_MAX)
1195 			slot.max_packets = 0;
1196 		if (slot.max_bytes == INT_MAX)
1197 			slot.max_bytes = 0;
1198 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1199 			goto nla_put_failure;
1200 	}
1201 
1202 	return nla_nest_end(skb, nla);
1203 
1204 nla_put_failure:
1205 	nlmsg_trim(skb, nla);
1206 	return -1;
1207 }
1208 
1209 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1210 			  struct sk_buff *skb, struct tcmsg *tcm)
1211 {
1212 	struct netem_sched_data *q = qdisc_priv(sch);
1213 
1214 	if (cl != 1 || !q->qdisc) 	/* only one class */
1215 		return -ENOENT;
1216 
1217 	tcm->tcm_handle |= TC_H_MIN(1);
1218 	tcm->tcm_info = q->qdisc->handle;
1219 
1220 	return 0;
1221 }
1222 
1223 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1224 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1225 {
1226 	struct netem_sched_data *q = qdisc_priv(sch);
1227 
1228 	*old = qdisc_replace(sch, new, &q->qdisc);
1229 	return 0;
1230 }
1231 
1232 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1233 {
1234 	struct netem_sched_data *q = qdisc_priv(sch);
1235 	return q->qdisc;
1236 }
1237 
1238 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1239 {
1240 	return 1;
1241 }
1242 
1243 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1244 {
1245 	if (!walker->stop) {
1246 		if (walker->count >= walker->skip)
1247 			if (walker->fn(sch, 1, walker) < 0) {
1248 				walker->stop = 1;
1249 				return;
1250 			}
1251 		walker->count++;
1252 	}
1253 }
1254 
1255 static const struct Qdisc_class_ops netem_class_ops = {
1256 	.graft		=	netem_graft,
1257 	.leaf		=	netem_leaf,
1258 	.find		=	netem_find,
1259 	.walk		=	netem_walk,
1260 	.dump		=	netem_dump_class,
1261 };
1262 
1263 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1264 	.id		=	"netem",
1265 	.cl_ops		=	&netem_class_ops,
1266 	.priv_size	=	sizeof(struct netem_sched_data),
1267 	.enqueue	=	netem_enqueue,
1268 	.dequeue	=	netem_dequeue,
1269 	.peek		=	qdisc_peek_dequeued,
1270 	.init		=	netem_init,
1271 	.reset		=	netem_reset,
1272 	.destroy	=	netem_destroy,
1273 	.change		=	netem_change,
1274 	.dump		=	netem_dump,
1275 	.owner		=	THIS_MODULE,
1276 };
1277 
1278 
1279 static int __init netem_module_init(void)
1280 {
1281 	pr_info("netem: version " VERSION "\n");
1282 	return register_qdisc(&netem_qdisc_ops);
1283 }
1284 static void __exit netem_module_exit(void)
1285 {
1286 	unregister_qdisc(&netem_qdisc_ops);
1287 }
1288 module_init(netem_module_init)
1289 module_exit(netem_module_exit)
1290 MODULE_LICENSE("GPL");
1291