xref: /openbmc/linux/net/sched/sch_netem.c (revision 2b1b838ea8e5437ef06a29818d16e9efdfaf0037)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_netem.c	Network emulator
4  *
5  *  		Many of the algorithms and ideas for this came from
6  *		NIST Net which is not copyrighted.
7  *
8  * Authors:	Stephen Hemminger <shemminger@osdl.org>
9  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10  */
11 
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/rtnetlink.h>
21 #include <linux/reciprocal_div.h>
22 #include <linux/rbtree.h>
23 
24 #include <net/gso.h>
25 #include <net/netlink.h>
26 #include <net/pkt_sched.h>
27 #include <net/inet_ecn.h>
28 
29 #define VERSION "1.3"
30 
31 /*	Network Emulation Queuing algorithm.
32 	====================================
33 
34 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
36 		 [2] Luigi Rizzo, DummyNet for FreeBSD
37 
38 	 ----------------------------------------------------------------
39 
40 	 This started out as a simple way to delay outgoing packets to
41 	 test TCP but has grown to include most of the functionality
42 	 of a full blown network emulator like NISTnet. It can delay
43 	 packets and add random jitter (and correlation). The random
44 	 distribution can be loaded from a table as well to provide
45 	 normal, Pareto, or experimental curves. Packet loss,
46 	 duplication, and reordering can also be emulated.
47 
48 	 This qdisc does not do classification that can be handled in
49 	 layering other disciplines.  It does not need to do bandwidth
50 	 control either since that can be handled by using token
51 	 bucket or other rate control.
52 
53      Correlated Loss Generator models
54 
55 	Added generation of correlated loss according to the
56 	"Gilbert-Elliot" model, a 4-state markov model.
57 
58 	References:
59 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
60 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
61 	and intuitive loss model for packet networks and its implementation
62 	in the Netem module in the Linux kernel", available in [1]
63 
	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
65 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
66 */
67 
/* Delay distribution table built by get_dist_table() from a netlink
 * attribute and sampled by tabledist().
 */
struct disttable {
	u32  size;	/* number of entries in table[]; never 0 once built */
	s16 table[];	/* samples; tabledist() picks table[rnd % size] */
};
72 
/* Private state of one netem qdisc instance, reached through qdisc_priv(). */
struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	/* armed by netem_dequeue() when the earliest packet is not yet due */
	struct qdisc_watchdog watchdog;

	/* base delay and jitter, in nanoseconds; netem_change() clamps
	 * jitter to [0, INT_MAX] so that it fits tabledist()'s s32 sigma
	 */
	s64 latency;
	s64 jitter;

	u32 loss;	/* random-loss threshold: 0 => none, ~0 => all */
	u32 ecn;	/* non-zero: try to CE-mark instead of dropping */
	u32 limit;	/* copy of qopt->limit (also stored in sch->limit) */
	u32 counter;	/* packets queued normally since the last reordered one */
	u32 gap;	/* reordering gap; 0 disables reordering */
	u32 duplicate;	/* duplication threshold, compared with get_crandom() */
	u32 reorder;	/* reordering threshold, compared with get_crandom() */
	u32 corrupt;	/* corruption threshold, compared with get_crandom() */
	u64 rate;	/* divisor in packet_time_ns(); 0 disables rate emulation */
	s32 packet_overhead;	/* added to every packet length in packet_time_ns() */
	u32 cell_size;	/* non-zero: lengths are rounded up to whole cells */
	struct reciprocal_value cell_size_reciprocal;	/* precomputed for cell_size */
	s32 cell_overhead;	/* added to cell_size for every cell used */

	/* state of one correlated random number generator, see get_crandom() */
	struct crndstate {
		u32 last;	/* previous output (seeded randomly by init_crandom()) */
		u32 rho;	/* correlation weight; 0 means uncorrelated */
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	/* optional delay distribution table; NULL selects the uniform default */
	struct disttable *delay_dist;

	/* which loss generator loss_event() dispatches to */
	enum  {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* values stored in clg.state while the 4-state model is active */
	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	/* values stored in clg.state while the Gilbert-Elliot model is active */
	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

	/* slot configuration as received from userspace, after the
	 * adjustments applied by get_slot()
	 */
	struct tc_netem_slot slot_config;
	/* run-time state of the current transmission slot */
	struct slotstate {
		u64 slot_next;		/* start of the next slot; 0 = slotting off */
		s32 packets_left;	/* packet budget remaining in this slot */
		s32 bytes_left;		/* byte budget remaining in this slot */
	} slot;

	/* optional distribution table for slot delays, see get_slot_next() */
	struct disttable *slot_dist;
};
150 
151 /* Time stamp put into socket buffer control block
152  * Only valid when skbs are in our internal t(ime)fifo queue.
153  *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
155  * and skb->next & skb->prev are scratch space for a qdisc,
156  * we save skb->tstamp value in skb->cb[] before destroying it.
157  */
struct netem_skb_cb {
	/* release time on the ktime_get_ns() clock, set by netem_enqueue() */
	u64	        time_to_send;
};
161 
162 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
163 {
164 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
165 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
166 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
167 }
168 
169 /* init_crandom - initialize correlated random number generator
170  * Use entropy source for initial seed.
171  */
172 static void init_crandom(struct crndstate *state, unsigned long rho)
173 {
174 	state->rho = rho;
175 	state->last = get_random_u32();
176 }
177 
178 /* get_crandom - correlated random number generator
179  * Next number depends on last value.
180  * rho is scaled to avoid floating point.
181  */
182 static u32 get_crandom(struct crndstate *state)
183 {
184 	u64 value, rho;
185 	unsigned long answer;
186 
187 	if (!state || state->rho == 0)	/* no correlation */
188 		return get_random_u32();
189 
190 	value = get_random_u32();
191 	rho = (u64)state->rho + 1;
192 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
193 	state->last = answer;
194 	return answer;
195 }
196 
197 /* loss_4state - 4-state model loss generator
198  * Generates losses according to the 4-state Markov chain adopted in
199  * the GI (General and Intuitive) loss model.
200  */
201 static bool loss_4state(struct netem_sched_data *q)
202 {
203 	struct clgstate *clg = &q->clg;
204 	u32 rnd = get_random_u32();
205 
206 	/*
207 	 * Makes a comparison between rnd and the transition
208 	 * probabilities outgoing from the current state, then decides the
209 	 * next state and if the next packet has to be transmitted or lost.
210 	 * The four states correspond to:
211 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
212 	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
213 	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
214 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
215 	 */
216 	switch (clg->state) {
217 	case TX_IN_GAP_PERIOD:
218 		if (rnd < clg->a4) {
219 			clg->state = LOST_IN_GAP_PERIOD;
220 			return true;
221 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
222 			clg->state = LOST_IN_BURST_PERIOD;
223 			return true;
224 		} else if (clg->a1 + clg->a4 < rnd) {
225 			clg->state = TX_IN_GAP_PERIOD;
226 		}
227 
228 		break;
229 	case TX_IN_BURST_PERIOD:
230 		if (rnd < clg->a5) {
231 			clg->state = LOST_IN_BURST_PERIOD;
232 			return true;
233 		} else {
234 			clg->state = TX_IN_BURST_PERIOD;
235 		}
236 
237 		break;
238 	case LOST_IN_BURST_PERIOD:
239 		if (rnd < clg->a3)
240 			clg->state = TX_IN_BURST_PERIOD;
241 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
242 			clg->state = TX_IN_GAP_PERIOD;
243 		} else if (clg->a2 + clg->a3 < rnd) {
244 			clg->state = LOST_IN_BURST_PERIOD;
245 			return true;
246 		}
247 		break;
248 	case LOST_IN_GAP_PERIOD:
249 		clg->state = TX_IN_GAP_PERIOD;
250 		break;
251 	}
252 
253 	return false;
254 }
255 
256 /* loss_gilb_ell - Gilbert-Elliot model loss generator
257  * Generates losses according to the Gilbert-Elliot loss model or
258  * its special cases  (Gilbert or Simple Gilbert)
259  *
260  * Makes a comparison between random number and the transition
261  * probabilities outgoing from the current state, then decides the
262  * next state. A second random number is extracted and the comparison
263  * with the loss probability of the current state decides if the next
264  * packet will be transmitted or lost.
265  */
266 static bool loss_gilb_ell(struct netem_sched_data *q)
267 {
268 	struct clgstate *clg = &q->clg;
269 
270 	switch (clg->state) {
271 	case GOOD_STATE:
272 		if (get_random_u32() < clg->a1)
273 			clg->state = BAD_STATE;
274 		if (get_random_u32() < clg->a4)
275 			return true;
276 		break;
277 	case BAD_STATE:
278 		if (get_random_u32() < clg->a2)
279 			clg->state = GOOD_STATE;
280 		if (get_random_u32() > clg->a3)
281 			return true;
282 	}
283 
284 	return false;
285 }
286 
287 static bool loss_event(struct netem_sched_data *q)
288 {
289 	switch (q->loss_model) {
290 	case CLG_RANDOM:
291 		/* Random packet drop 0 => none, ~0 => all */
292 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
293 
294 	case CLG_4_STATES:
295 		/* 4state loss model algorithm (used also for GI model)
296 		* Extracts a value from the markov 4 state loss generator,
297 		* if it is 1 drops a packet and if needed writes the event in
298 		* the kernel logs
299 		*/
300 		return loss_4state(q);
301 
302 	case CLG_GILB_ELL:
303 		/* Gilbert-Elliot loss model algorithm
304 		* Extracts a value from the Gilbert-Elliot loss generator,
305 		* if it is 1 drops a packet and if needed writes the event in
306 		* the kernel logs
307 		*/
308 		return loss_gilb_ell(q);
309 	}
310 
311 	return false;	/* not reached */
312 }
313 
314 
315 /* tabledist - return a pseudo-randomly distributed value with mean mu and
316  * std deviation sigma.  Uses table lookup to approximate the desired
317  * distribution, and a uniformly-distributed pseudo-random source.
318  */
319 static s64 tabledist(s64 mu, s32 sigma,
320 		     struct crndstate *state,
321 		     const struct disttable *dist)
322 {
323 	s64 x;
324 	long t;
325 	u32 rnd;
326 
327 	if (sigma == 0)
328 		return mu;
329 
330 	rnd = get_crandom(state);
331 
332 	/* default uniform distribution */
333 	if (dist == NULL)
334 		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
335 
336 	t = dist->table[rnd % dist->size];
337 	x = (sigma % NETEM_DIST_SCALE) * t;
338 	if (x >= 0)
339 		x += NETEM_DIST_SCALE/2;
340 	else
341 		x -= NETEM_DIST_SCALE/2;
342 
343 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
344 }
345 
346 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
347 {
348 	len += q->packet_overhead;
349 
350 	if (q->cell_size) {
351 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
352 
353 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
354 			cells++;
355 		len = cells * (q->cell_size + q->cell_overhead);
356 	}
357 
358 	return div64_u64(len * NSEC_PER_SEC, q->rate);
359 }
360 
361 static void tfifo_reset(struct Qdisc *sch)
362 {
363 	struct netem_sched_data *q = qdisc_priv(sch);
364 	struct rb_node *p = rb_first(&q->t_root);
365 
366 	while (p) {
367 		struct sk_buff *skb = rb_to_skb(p);
368 
369 		p = rb_next(p);
370 		rb_erase(&skb->rbnode, &q->t_root);
371 		rtnl_kfree_skbs(skb, skb);
372 	}
373 
374 	rtnl_kfree_skbs(q->t_head, q->t_tail);
375 	q->t_head = NULL;
376 	q->t_tail = NULL;
377 }
378 
379 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
380 {
381 	struct netem_sched_data *q = qdisc_priv(sch);
382 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
383 
384 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
385 		if (q->t_tail)
386 			q->t_tail->next = nskb;
387 		else
388 			q->t_head = nskb;
389 		q->t_tail = nskb;
390 	} else {
391 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
392 
393 		while (*p) {
394 			struct sk_buff *skb;
395 
396 			parent = *p;
397 			skb = rb_to_skb(parent);
398 			if (tnext >= netem_skb_cb(skb)->time_to_send)
399 				p = &parent->rb_right;
400 			else
401 				p = &parent->rb_left;
402 		}
403 		rb_link_node(&nskb->rbnode, parent, p);
404 		rb_insert_color(&nskb->rbnode, &q->t_root);
405 	}
406 	sch->q.qlen++;
407 }
408 
409 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
410  * when we statistically choose to corrupt one, we instead segment it, returning
411  * the first packet to be corrupted, and re-enqueue the remaining frames
412  */
413 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
414 				     struct sk_buff **to_free)
415 {
416 	struct sk_buff *segs;
417 	netdev_features_t features = netif_skb_features(skb);
418 
419 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
420 
421 	if (IS_ERR_OR_NULL(segs)) {
422 		qdisc_drop(skb, sch, to_free);
423 		return NULL;
424 	}
425 	consume_skb(skb);
426 	return segs;
427 }
428 
/*
 * Insert one skb into qdisc.
 *
 * Applies, in this order: random duplication, loss (or ECN marking),
 * orphaning, corruption (segmenting GSO packets first), the queue limit
 * check, and finally delay/rate/reordering scheduling.
 *
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	/* remaining segments when a GSO packet had to be split for corruption */
	struct sk_buff *segs = NULL;
	/* length the parent qdiscs have accounted for this packet */
	unsigned int prev_len = qdisc_pkt_len(skb);
	/* copies to queue: 0 = lost, 1 = normal, 2 = duplicated */
	int count = 1;
	int rc = NET_XMIT_SUCCESS;
	/* result for later drop paths; turns into NET_XMIT_SUCCESS once a
	 * duplicate has been handed to the root qdisc
	 */
	int rc_drop = NET_XMIT_DROP;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		/* with ECN enabled a successful CE mark replaces the drop, but
		 * is still counted in the drop statistics
		 */
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		/* lost and not duplicated: drop silently, reporting success so
		 * that the sender does not see the emulated loss as a local drop
		 */
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		rc_drop = NET_XMIT_SUCCESS;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (skb_is_gso(skb)) {
			/* corrupt only the first segment; the rest is kept in
			 * segs and enqueued individually at finish_segs
			 */
			skb = netem_segment(skb, sch, to_free);
			if (!skb)
				return rc_drop;
			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		/* flip one random bit of one random byte in the linear area */
		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
			1<<get_random_u32_below(8);
	}

	if (unlikely(sch->q.qlen >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		return rc_drop;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			/* cb of the queued packet with the latest
			 * time_to_send, searched in sch->q, the rbtree and
			 * the linear list
			 */
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			/* serialization time of this packet at q->rate */
			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (segs) {
		/* len/nb accumulate what was actually queued, so that the
		 * parents' accounting can be corrected afterwards
		 */
		unsigned int len, last_len;
		int nb;

		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			/* segs may be gone after qdisc_enqueue(); remember its length */
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		/* corruption path lost the only packet and there are no segments */
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}
625 
626 /* Delay the next round with a new future slot with a
627  * correct number of bytes and packets.
628  */
629 
630 static void get_slot_next(struct netem_sched_data *q, u64 now)
631 {
632 	s64 next_delay;
633 
634 	if (!q->slot_dist)
635 		next_delay = q->slot_config.min_delay +
636 				(get_random_u32() *
637 				 (q->slot_config.max_delay -
638 				  q->slot_config.min_delay) >> 32);
639 	else
640 		next_delay = tabledist(q->slot_config.dist_delay,
641 				       (s32)(q->slot_config.dist_jitter),
642 				       NULL, q->slot_dist);
643 
644 	q->slot.slot_next = now + next_delay;
645 	q->slot.packets_left = q->slot_config.max_packets;
646 	q->slot.bytes_left = q->slot_config.max_bytes;
647 }
648 
649 static struct sk_buff *netem_peek(struct netem_sched_data *q)
650 {
651 	struct sk_buff *skb = skb_rb_first(&q->t_root);
652 	u64 t1, t2;
653 
654 	if (!skb)
655 		return q->t_head;
656 	if (!q->t_head)
657 		return skb;
658 
659 	t1 = netem_skb_cb(skb)->time_to_send;
660 	t2 = netem_skb_cb(q->t_head)->time_to_send;
661 	if (t1 < t2)
662 		return skb;
663 	return q->t_head;
664 }
665 
666 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
667 {
668 	if (skb == q->t_head) {
669 		q->t_head = skb->next;
670 		if (!q->t_head)
671 			q->t_tail = NULL;
672 	} else {
673 		rb_erase(&skb->rbnode, &q->t_root);
674 	}
675 }
676 
677 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
678 {
679 	struct netem_sched_data *q = qdisc_priv(sch);
680 	struct sk_buff *skb;
681 
682 tfifo_dequeue:
683 	skb = __qdisc_dequeue_head(&sch->q);
684 	if (skb) {
685 		qdisc_qstats_backlog_dec(sch, skb);
686 deliver:
687 		qdisc_bstats_update(sch, skb);
688 		return skb;
689 	}
690 	skb = netem_peek(q);
691 	if (skb) {
692 		u64 time_to_send;
693 		u64 now = ktime_get_ns();
694 
695 		/* if more time remaining? */
696 		time_to_send = netem_skb_cb(skb)->time_to_send;
697 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
698 			get_slot_next(q, now);
699 
700 		if (time_to_send <= now && q->slot.slot_next <= now) {
701 			netem_erase_head(q, skb);
702 			sch->q.qlen--;
703 			qdisc_qstats_backlog_dec(sch, skb);
704 			skb->next = NULL;
705 			skb->prev = NULL;
706 			/* skb->dev shares skb->rbnode area,
707 			 * we need to restore its value.
708 			 */
709 			skb->dev = qdisc_dev(sch);
710 
711 			if (q->slot.slot_next) {
712 				q->slot.packets_left--;
713 				q->slot.bytes_left -= qdisc_pkt_len(skb);
714 				if (q->slot.packets_left <= 0 ||
715 				    q->slot.bytes_left <= 0)
716 					get_slot_next(q, now);
717 			}
718 
719 			if (q->qdisc) {
720 				unsigned int pkt_len = qdisc_pkt_len(skb);
721 				struct sk_buff *to_free = NULL;
722 				int err;
723 
724 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
725 				kfree_skb_list(to_free);
726 				if (err != NET_XMIT_SUCCESS &&
727 				    net_xmit_drop_count(err)) {
728 					qdisc_qstats_drop(sch);
729 					qdisc_tree_reduce_backlog(sch, 1,
730 								  pkt_len);
731 				}
732 				goto tfifo_dequeue;
733 			}
734 			goto deliver;
735 		}
736 
737 		if (q->qdisc) {
738 			skb = q->qdisc->ops->dequeue(q->qdisc);
739 			if (skb)
740 				goto deliver;
741 		}
742 
743 		qdisc_watchdog_schedule_ns(&q->watchdog,
744 					   max(time_to_send,
745 					       q->slot.slot_next));
746 	}
747 
748 	if (q->qdisc) {
749 		skb = q->qdisc->ops->dequeue(q->qdisc);
750 		if (skb)
751 			goto deliver;
752 	}
753 	return NULL;
754 }
755 
756 static void netem_reset(struct Qdisc *sch)
757 {
758 	struct netem_sched_data *q = qdisc_priv(sch);
759 
760 	qdisc_reset_queue(sch);
761 	tfifo_reset(sch);
762 	if (q->qdisc)
763 		qdisc_reset(q->qdisc);
764 	qdisc_watchdog_cancel(&q->watchdog);
765 }
766 
/* Release a distribution table allocated by get_dist_table().
 * Callers in this file also pass NULL (no table loaded).
 */
static void dist_free(struct disttable *d)
{
	kvfree(d);
}
771 
772 /*
773  * Distribution data is a variable size payload containing
774  * signed 16 bit values.
775  */
776 
777 static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
778 {
779 	size_t n = nla_len(attr)/sizeof(__s16);
780 	const __s16 *data = nla_data(attr);
781 	struct disttable *d;
782 	int i;
783 
784 	if (!n || n > NETEM_DIST_MAX)
785 		return -EINVAL;
786 
787 	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
788 	if (!d)
789 		return -ENOMEM;
790 
791 	d->size = n;
792 	for (i = 0; i < n; i++)
793 		d->table[i] = data[i];
794 
795 	*tbl = d;
796 	return 0;
797 }
798 
799 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
800 {
801 	const struct tc_netem_slot *c = nla_data(attr);
802 
803 	q->slot_config = *c;
804 	if (q->slot_config.max_packets == 0)
805 		q->slot_config.max_packets = INT_MAX;
806 	if (q->slot_config.max_bytes == 0)
807 		q->slot_config.max_bytes = INT_MAX;
808 
809 	/* capping dist_jitter to the range acceptable by tabledist() */
810 	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
811 
812 	q->slot.packets_left = q->slot_config.max_packets;
813 	q->slot.bytes_left = q->slot_config.max_bytes;
814 	if (q->slot_config.min_delay | q->slot_config.max_delay |
815 	    q->slot_config.dist_jitter)
816 		q->slot.slot_next = ktime_get_ns();
817 	else
818 		q->slot.slot_next = 0;
819 }
820 
821 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
822 {
823 	const struct tc_netem_corr *c = nla_data(attr);
824 
825 	init_crandom(&q->delay_cor, c->delay_corr);
826 	init_crandom(&q->loss_cor, c->loss_corr);
827 	init_crandom(&q->dup_cor, c->dup_corr);
828 }
829 
830 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
831 {
832 	const struct tc_netem_reorder *r = nla_data(attr);
833 
834 	q->reorder = r->probability;
835 	init_crandom(&q->reorder_cor, r->correlation);
836 }
837 
838 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 	const struct tc_netem_corrupt *r = nla_data(attr);
841 
842 	q->corrupt = r->probability;
843 	init_crandom(&q->corrupt_cor, r->correlation);
844 }
845 
846 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 	const struct tc_netem_rate *r = nla_data(attr);
849 
850 	q->rate = r->rate;
851 	q->packet_overhead = r->packet_overhead;
852 	q->cell_size = r->cell_size;
853 	q->cell_overhead = r->cell_overhead;
854 	if (q->cell_size)
855 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
856 	else
857 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
858 }
859 
860 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
861 {
862 	const struct nlattr *la;
863 	int rem;
864 
865 	nla_for_each_nested(la, attr, rem) {
866 		u16 type = nla_type(la);
867 
868 		switch (type) {
869 		case NETEM_LOSS_GI: {
870 			const struct tc_netem_gimodel *gi = nla_data(la);
871 
872 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
873 				pr_info("netem: incorrect gi model size\n");
874 				return -EINVAL;
875 			}
876 
877 			q->loss_model = CLG_4_STATES;
878 
879 			q->clg.state = TX_IN_GAP_PERIOD;
880 			q->clg.a1 = gi->p13;
881 			q->clg.a2 = gi->p31;
882 			q->clg.a3 = gi->p32;
883 			q->clg.a4 = gi->p14;
884 			q->clg.a5 = gi->p23;
885 			break;
886 		}
887 
888 		case NETEM_LOSS_GE: {
889 			const struct tc_netem_gemodel *ge = nla_data(la);
890 
891 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
892 				pr_info("netem: incorrect ge model size\n");
893 				return -EINVAL;
894 			}
895 
896 			q->loss_model = CLG_GILB_ELL;
897 			q->clg.state = GOOD_STATE;
898 			q->clg.a1 = ge->p;
899 			q->clg.a2 = ge->r;
900 			q->clg.a3 = ge->h;
901 			q->clg.a4 = ge->k1;
902 			break;
903 		}
904 
905 		default:
906 			pr_info("netem: unknown loss type %u\n", type);
907 			return -EINVAL;
908 		}
909 	}
910 
911 	return 0;
912 }
913 
/* Netlink policy for the TCA_NETEM_* attributes that follow
 * struct tc_netem_qopt; netem_change() hands it to parse_attr().
 * TCA_NETEM_DELAY_DIST and TCA_NETEM_SLOT_DIST have no entry here; their
 * length is checked in get_dist_table() instead.
 */
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
};
926 
927 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
928 		      const struct nla_policy *policy, int len)
929 {
930 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
931 
932 	if (nested_len < 0) {
933 		pr_info("netem: invalid attributes len %d\n", nested_len);
934 		return -EINVAL;
935 	}
936 
937 	if (nested_len >= nla_attr_size(0))
938 		return nla_parse_deprecated(tb, maxtype,
939 					    nla_data(nla) + NLA_ALIGN(len),
940 					    nested_len, policy, NULL);
941 
942 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
943 	return 0;
944 }
945 
/* Parse netlink message to set options
 *
 * New distribution tables are allocated before the qdisc tree lock is
 * taken; everything else is applied under the lock. If parsing the loss
 * model fails, the previous loss model and its state are restored and no
 * other setting is touched.
 *
 * Returns 0 on success or a negative errno.
 */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct disttable *delay_dist = NULL;
	struct disttable *slot_dist = NULL;
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto table_free;
	}

	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto table_free;
	}

	sch_tree_lock(sch);
	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			/* get_loss_clg() may have partially updated the state */
			q->loss_model = old_loss_model;
			q->clg = old_clg;
			goto unlock;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	/* install the new tables; after the swap the local pointers hold the
	 * previous tables, which are freed at table_free
	 */
	if (delay_dist)
		swap(q->delay_dist, delay_dist);
	if (slot_dist)
		swap(q->slot_dist, slot_dist);
	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	/* the 64-bit rate only takes effect if it exceeds the current q->rate */
	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	/* the 64-bit attributes override the tick-based values set above */
	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	/* capping jitter to the range acceptable by tabledist() */
	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);

unlock:
	sch_tree_unlock(sch);

table_free:
	/* frees the replaced tables on success, the unused new ones on failure */
	dist_free(delay_dist);
	dist_free(slot_dist);
	return ret;
}
1051 
1052 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1053 		      struct netlink_ext_ack *extack)
1054 {
1055 	struct netem_sched_data *q = qdisc_priv(sch);
1056 	int ret;
1057 
1058 	qdisc_watchdog_init(&q->watchdog, sch);
1059 
1060 	if (!opt)
1061 		return -EINVAL;
1062 
1063 	q->loss_model = CLG_RANDOM;
1064 	ret = netem_change(sch, opt, extack);
1065 	if (ret)
1066 		pr_info("netem: change failed\n");
1067 	return ret;
1068 }
1069 
1070 static void netem_destroy(struct Qdisc *sch)
1071 {
1072 	struct netem_sched_data *q = qdisc_priv(sch);
1073 
1074 	qdisc_watchdog_cancel(&q->watchdog);
1075 	if (q->qdisc)
1076 		qdisc_put(q->qdisc);
1077 	dist_free(q->delay_dist);
1078 	dist_free(q->slot_dist);
1079 }
1080 
1081 static int dump_loss_model(const struct netem_sched_data *q,
1082 			   struct sk_buff *skb)
1083 {
1084 	struct nlattr *nest;
1085 
1086 	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1087 	if (nest == NULL)
1088 		goto nla_put_failure;
1089 
1090 	switch (q->loss_model) {
1091 	case CLG_RANDOM:
1092 		/* legacy loss model */
1093 		nla_nest_cancel(skb, nest);
1094 		return 0;	/* no data */
1095 
1096 	case CLG_4_STATES: {
1097 		struct tc_netem_gimodel gi = {
1098 			.p13 = q->clg.a1,
1099 			.p31 = q->clg.a2,
1100 			.p32 = q->clg.a3,
1101 			.p14 = q->clg.a4,
1102 			.p23 = q->clg.a5,
1103 		};
1104 
1105 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1106 			goto nla_put_failure;
1107 		break;
1108 	}
1109 	case CLG_GILB_ELL: {
1110 		struct tc_netem_gemodel ge = {
1111 			.p = q->clg.a1,
1112 			.r = q->clg.a2,
1113 			.h = q->clg.a3,
1114 			.k1 = q->clg.a4,
1115 		};
1116 
1117 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1118 			goto nla_put_failure;
1119 		break;
1120 	}
1121 	}
1122 
1123 	nla_nest_end(skb, nest);
1124 	return 0;
1125 
1126 nla_put_failure:
1127 	nla_nest_cancel(skb, nest);
1128 	return -1;
1129 }
1130 
1131 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1132 {
1133 	const struct netem_sched_data *q = qdisc_priv(sch);
1134 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1135 	struct tc_netem_qopt qopt;
1136 	struct tc_netem_corr cor;
1137 	struct tc_netem_reorder reorder;
1138 	struct tc_netem_corrupt corrupt;
1139 	struct tc_netem_rate rate;
1140 	struct tc_netem_slot slot;
1141 
1142 	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1143 			     UINT_MAX);
1144 	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1145 			    UINT_MAX);
1146 	qopt.limit = q->limit;
1147 	qopt.loss = q->loss;
1148 	qopt.gap = q->gap;
1149 	qopt.duplicate = q->duplicate;
1150 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1151 		goto nla_put_failure;
1152 
1153 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1154 		goto nla_put_failure;
1155 
1156 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1157 		goto nla_put_failure;
1158 
1159 	cor.delay_corr = q->delay_cor.rho;
1160 	cor.loss_corr = q->loss_cor.rho;
1161 	cor.dup_corr = q->dup_cor.rho;
1162 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1163 		goto nla_put_failure;
1164 
1165 	reorder.probability = q->reorder;
1166 	reorder.correlation = q->reorder_cor.rho;
1167 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1168 		goto nla_put_failure;
1169 
1170 	corrupt.probability = q->corrupt;
1171 	corrupt.correlation = q->corrupt_cor.rho;
1172 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1173 		goto nla_put_failure;
1174 
1175 	if (q->rate >= (1ULL << 32)) {
1176 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1177 				      TCA_NETEM_PAD))
1178 			goto nla_put_failure;
1179 		rate.rate = ~0U;
1180 	} else {
1181 		rate.rate = q->rate;
1182 	}
1183 	rate.packet_overhead = q->packet_overhead;
1184 	rate.cell_size = q->cell_size;
1185 	rate.cell_overhead = q->cell_overhead;
1186 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1187 		goto nla_put_failure;
1188 
1189 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1190 		goto nla_put_failure;
1191 
1192 	if (dump_loss_model(q, skb) != 0)
1193 		goto nla_put_failure;
1194 
1195 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1196 	    q->slot_config.dist_jitter) {
1197 		slot = q->slot_config;
1198 		if (slot.max_packets == INT_MAX)
1199 			slot.max_packets = 0;
1200 		if (slot.max_bytes == INT_MAX)
1201 			slot.max_bytes = 0;
1202 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1203 			goto nla_put_failure;
1204 	}
1205 
1206 	return nla_nest_end(skb, nla);
1207 
1208 nla_put_failure:
1209 	nlmsg_trim(skb, nla);
1210 	return -1;
1211 }
1212 
1213 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1214 			  struct sk_buff *skb, struct tcmsg *tcm)
1215 {
1216 	struct netem_sched_data *q = qdisc_priv(sch);
1217 
1218 	if (cl != 1 || !q->qdisc) 	/* only one class */
1219 		return -ENOENT;
1220 
1221 	tcm->tcm_handle |= TC_H_MIN(1);
1222 	tcm->tcm_info = q->qdisc->handle;
1223 
1224 	return 0;
1225 }
1226 
1227 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1228 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1229 {
1230 	struct netem_sched_data *q = qdisc_priv(sch);
1231 
1232 	*old = qdisc_replace(sch, new, &q->qdisc);
1233 	return 0;
1234 }
1235 
1236 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1237 {
1238 	struct netem_sched_data *q = qdisc_priv(sch);
1239 	return q->qdisc;
1240 }
1241 
1242 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1243 {
1244 	return 1;
1245 }
1246 
1247 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1248 {
1249 	if (!walker->stop) {
1250 		if (!tc_qdisc_stats_dump(sch, 1, walker))
1251 			return;
1252 	}
1253 }
1254 
1255 static const struct Qdisc_class_ops netem_class_ops = {
1256 	.graft		=	netem_graft,
1257 	.leaf		=	netem_leaf,
1258 	.find		=	netem_find,
1259 	.walk		=	netem_walk,
1260 	.dump		=	netem_dump_class,
1261 };
1262 
1263 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1264 	.id		=	"netem",
1265 	.cl_ops		=	&netem_class_ops,
1266 	.priv_size	=	sizeof(struct netem_sched_data),
1267 	.enqueue	=	netem_enqueue,
1268 	.dequeue	=	netem_dequeue,
1269 	.peek		=	qdisc_peek_dequeued,
1270 	.init		=	netem_init,
1271 	.reset		=	netem_reset,
1272 	.destroy	=	netem_destroy,
1273 	.change		=	netem_change,
1274 	.dump		=	netem_dump,
1275 	.owner		=	THIS_MODULE,
1276 };
1277 
1278 
1279 static int __init netem_module_init(void)
1280 {
1281 	pr_info("netem: version " VERSION "\n");
1282 	return register_qdisc(&netem_qdisc_ops);
1283 }
1284 static void __exit netem_module_exit(void)
1285 {
1286 	unregister_qdisc(&netem_qdisc_ops);
1287 }
1288 module_init(netem_module_init)
1289 module_exit(netem_module_exit)
1290 MODULE_LICENSE("GPL");
1291