xref: /openbmc/linux/net/sched/sch_netem.c (revision 5f2cf757f9c56255470c23a2a4a5574a34edad4b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_netem.c	Network emulator
4  *
5  *  		Many of the algorithms and ideas for this came from
6  *		NIST Net which is not copyrighted.
7  *
8  * Authors:	Stephen Hemminger <shemminger@osdl.org>
9  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10  */
11 
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/rtnetlink.h>
21 #include <linux/reciprocal_div.h>
22 #include <linux/rbtree.h>
23 
24 #include <net/gso.h>
25 #include <net/netlink.h>
26 #include <net/pkt_sched.h>
27 #include <net/inet_ecn.h>
28 
29 #define VERSION "1.3"
30 
31 /*	Network Emulation Queuing algorithm.
32 	====================================
33 
34 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
35 		 Network Emulation Tool
36 		 [2] Luigi Rizzo, DummyNet for FreeBSD
37 
38 	 ----------------------------------------------------------------
39 
40 	 This started out as a simple way to delay outgoing packets to
41 	 test TCP but has grown to include most of the functionality
42 	 of a full blown network emulator like NISTnet. It can delay
43 	 packets and add random jitter (and correlation). The random
44 	 distribution can be loaded from a table as well to provide
45 	 normal, Pareto, or experimental curves. Packet loss,
46 	 duplication, and reordering can also be emulated.
47 
48 	 This qdisc does not do classification that can be handled in
49 	 layering other disciplines.  It does not need to do bandwidth
50 	 control either since that can be handled by using token
51 	 bucket or other rate control.
52 
53      Correlated Loss Generator models
54 
55 	Added generation of correlated loss according to the
56 	"Gilbert-Elliot" model, a 4-state markov model.
57 
58 	References:
59 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
60 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
61 	and intuitive loss model for packet networks and its implementation
62 	in the Netem module in the Linux kernel", available in [1]
63 
64 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
65 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
66 */
67 
68 struct disttable {
69 	u32  size;
70 	s16 table[];
71 };
72 
73 struct netem_sched_data {
74 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
75 	struct rb_root t_root;
76 
77 	/* a linear queue; reduces rbtree rebalancing when jitter is low */
78 	struct sk_buff	*t_head;
79 	struct sk_buff	*t_tail;
80 
81 	/* optional qdisc for classful handling (NULL at netem init) */
82 	struct Qdisc	*qdisc;
83 
84 	struct qdisc_watchdog watchdog;
85 
86 	s64 latency;
87 	s64 jitter;
88 
89 	u32 loss;
90 	u32 ecn;
91 	u32 limit;
92 	u32 counter;
93 	u32 gap;
94 	u32 duplicate;
95 	u32 reorder;
96 	u32 corrupt;
97 	u64 rate;
98 	s32 packet_overhead;
99 	u32 cell_size;
100 	struct reciprocal_value cell_size_reciprocal;
101 	s32 cell_overhead;
102 
103 	struct crndstate {
104 		u32 last;
105 		u32 rho;
106 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
107 
108 	struct disttable *delay_dist;
109 
110 	enum  {
111 		CLG_RANDOM,
112 		CLG_4_STATES,
113 		CLG_GILB_ELL,
114 	} loss_model;
115 
116 	enum {
117 		TX_IN_GAP_PERIOD = 1,
118 		TX_IN_BURST_PERIOD,
119 		LOST_IN_GAP_PERIOD,
120 		LOST_IN_BURST_PERIOD,
121 	} _4_state_model;
122 
123 	enum {
124 		GOOD_STATE = 1,
125 		BAD_STATE,
126 	} GE_state_model;
127 
128 	/* Correlated Loss Generation models */
129 	struct clgstate {
130 		/* state of the Markov chain */
131 		u8 state;
132 
133 		/* 4-states and Gilbert-Elliot models */
134 		u32 a1;	/* p13 for 4-states or p for GE */
135 		u32 a2;	/* p31 for 4-states or r for GE */
136 		u32 a3;	/* p32 for 4-states or h for GE */
137 		u32 a4;	/* p14 for 4-states or 1-k for GE */
138 		u32 a5; /* p23 used only in 4-states */
139 	} clg;
140 
141 	struct tc_netem_slot slot_config;
142 	struct slotstate {
143 		u64 slot_next;
144 		s32 packets_left;
145 		s32 bytes_left;
146 	} slot;
147 
148 	struct disttable *slot_dist;
149 };
150 
151 /* Time stamp put into socket buffer control block
152  * Only valid when skbs are in our internal t(ime)fifo queue.
153  *
154  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
155  * and skb->next & skb->prev are scratch space for a qdisc,
156  * we save skb->tstamp value in skb->cb[] before destroying it.
157  */
158 struct netem_skb_cb {
159 	u64	        time_to_send;
160 };
161 
162 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
163 {
164 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
165 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
166 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
167 }
168 
169 /* init_crandom - initialize correlated random number generator
170  * Use entropy source for initial seed.
171  */
172 static void init_crandom(struct crndstate *state, unsigned long rho)
173 {
174 	state->rho = rho;
175 	state->last = get_random_u32();
176 }
177 
178 /* get_crandom - correlated random number generator
179  * Next number depends on last value.
180  * rho is scaled to avoid floating point.
181  */
182 static u32 get_crandom(struct crndstate *state)
183 {
184 	u64 value, rho;
185 	unsigned long answer;
186 
187 	if (!state || state->rho == 0)	/* no correlation */
188 		return get_random_u32();
189 
190 	value = get_random_u32();
191 	rho = (u64)state->rho + 1;
192 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
193 	state->last = answer;
194 	return answer;
195 }
196 
197 /* loss_4state - 4-state model loss generator
198  * Generates losses according to the 4-state Markov chain adopted in
199  * the GI (General and Intuitive) loss model.
200  */
201 static bool loss_4state(struct netem_sched_data *q)
202 {
203 	struct clgstate *clg = &q->clg;
204 	u32 rnd = get_random_u32();
205 
206 	/*
207 	 * Makes a comparison between rnd and the transition
208 	 * probabilities outgoing from the current state, then decides the
209 	 * next state and if the next packet has to be transmitted or lost.
210 	 * The four states correspond to:
211 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
212 	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
213 	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
214 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
215 	 */
216 	switch (clg->state) {
217 	case TX_IN_GAP_PERIOD:
218 		if (rnd < clg->a4) {
219 			clg->state = LOST_IN_GAP_PERIOD;
220 			return true;
221 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
222 			clg->state = LOST_IN_BURST_PERIOD;
223 			return true;
224 		} else if (clg->a1 + clg->a4 < rnd) {
225 			clg->state = TX_IN_GAP_PERIOD;
226 		}
227 
228 		break;
229 	case TX_IN_BURST_PERIOD:
230 		if (rnd < clg->a5) {
231 			clg->state = LOST_IN_BURST_PERIOD;
232 			return true;
233 		} else {
234 			clg->state = TX_IN_BURST_PERIOD;
235 		}
236 
237 		break;
238 	case LOST_IN_BURST_PERIOD:
239 		if (rnd < clg->a3)
240 			clg->state = TX_IN_BURST_PERIOD;
241 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
242 			clg->state = TX_IN_GAP_PERIOD;
243 		} else if (clg->a2 + clg->a3 < rnd) {
244 			clg->state = LOST_IN_BURST_PERIOD;
245 			return true;
246 		}
247 		break;
248 	case LOST_IN_GAP_PERIOD:
249 		clg->state = TX_IN_GAP_PERIOD;
250 		break;
251 	}
252 
253 	return false;
254 }
255 
256 /* loss_gilb_ell - Gilbert-Elliot model loss generator
257  * Generates losses according to the Gilbert-Elliot loss model or
258  * its special cases  (Gilbert or Simple Gilbert)
259  *
260  * Makes a comparison between random number and the transition
261  * probabilities outgoing from the current state, then decides the
262  * next state. A second random number is extracted and the comparison
263  * with the loss probability of the current state decides if the next
264  * packet will be transmitted or lost.
265  */
266 static bool loss_gilb_ell(struct netem_sched_data *q)
267 {
268 	struct clgstate *clg = &q->clg;
269 
270 	switch (clg->state) {
271 	case GOOD_STATE:
272 		if (get_random_u32() < clg->a1)
273 			clg->state = BAD_STATE;
274 		if (get_random_u32() < clg->a4)
275 			return true;
276 		break;
277 	case BAD_STATE:
278 		if (get_random_u32() < clg->a2)
279 			clg->state = GOOD_STATE;
280 		if (get_random_u32() > clg->a3)
281 			return true;
282 	}
283 
284 	return false;
285 }
286 
287 static bool loss_event(struct netem_sched_data *q)
288 {
289 	switch (q->loss_model) {
290 	case CLG_RANDOM:
291 		/* Random packet drop 0 => none, ~0 => all */
292 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
293 
294 	case CLG_4_STATES:
295 		/* 4state loss model algorithm (used also for GI model)
296 		* Extracts a value from the markov 4 state loss generator,
297 		* if it is 1 drops a packet and if needed writes the event in
298 		* the kernel logs
299 		*/
300 		return loss_4state(q);
301 
302 	case CLG_GILB_ELL:
303 		/* Gilbert-Elliot loss model algorithm
304 		* Extracts a value from the Gilbert-Elliot loss generator,
305 		* if it is 1 drops a packet and if needed writes the event in
306 		* the kernel logs
307 		*/
308 		return loss_gilb_ell(q);
309 	}
310 
311 	return false;	/* not reached */
312 }
313 
314 
315 /* tabledist - return a pseudo-randomly distributed value with mean mu and
316  * std deviation sigma.  Uses table lookup to approximate the desired
317  * distribution, and a uniformly-distributed pseudo-random source.
318  */
319 static s64 tabledist(s64 mu, s32 sigma,
320 		     struct crndstate *state,
321 		     const struct disttable *dist)
322 {
323 	s64 x;
324 	long t;
325 	u32 rnd;
326 
327 	if (sigma == 0)
328 		return mu;
329 
330 	rnd = get_crandom(state);
331 
332 	/* default uniform distribution */
333 	if (dist == NULL)
334 		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
335 
336 	t = dist->table[rnd % dist->size];
337 	x = (sigma % NETEM_DIST_SCALE) * t;
338 	if (x >= 0)
339 		x += NETEM_DIST_SCALE/2;
340 	else
341 		x -= NETEM_DIST_SCALE/2;
342 
343 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
344 }
345 
346 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
347 {
348 	len += q->packet_overhead;
349 
350 	if (q->cell_size) {
351 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
352 
353 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
354 			cells++;
355 		len = cells * (q->cell_size + q->cell_overhead);
356 	}
357 
358 	return div64_u64(len * NSEC_PER_SEC, q->rate);
359 }
360 
361 static void tfifo_reset(struct Qdisc *sch)
362 {
363 	struct netem_sched_data *q = qdisc_priv(sch);
364 	struct rb_node *p = rb_first(&q->t_root);
365 
366 	while (p) {
367 		struct sk_buff *skb = rb_to_skb(p);
368 
369 		p = rb_next(p);
370 		rb_erase(&skb->rbnode, &q->t_root);
371 		rtnl_kfree_skbs(skb, skb);
372 	}
373 
374 	rtnl_kfree_skbs(q->t_head, q->t_tail);
375 	q->t_head = NULL;
376 	q->t_tail = NULL;
377 }
378 
379 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
380 {
381 	struct netem_sched_data *q = qdisc_priv(sch);
382 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
383 
384 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
385 		if (q->t_tail)
386 			q->t_tail->next = nskb;
387 		else
388 			q->t_head = nskb;
389 		q->t_tail = nskb;
390 	} else {
391 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
392 
393 		while (*p) {
394 			struct sk_buff *skb;
395 
396 			parent = *p;
397 			skb = rb_to_skb(parent);
398 			if (tnext >= netem_skb_cb(skb)->time_to_send)
399 				p = &parent->rb_right;
400 			else
401 				p = &parent->rb_left;
402 		}
403 		rb_link_node(&nskb->rbnode, parent, p);
404 		rb_insert_color(&nskb->rbnode, &q->t_root);
405 	}
406 	sch->q.qlen++;
407 }
408 
409 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
410  * when we statistically choose to corrupt one, we instead segment it, returning
411  * the first packet to be corrupted, and re-enqueue the remaining frames
412  */
413 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
414 				     struct sk_buff **to_free)
415 {
416 	struct sk_buff *segs;
417 	netdev_features_t features = netif_skb_features(skb);
418 
419 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
420 
421 	if (IS_ERR_OR_NULL(segs)) {
422 		qdisc_drop(skb, sch, to_free);
423 		return NULL;
424 	}
425 	consume_skb(skb);
426 	return segs;
427 }
428 
429 /*
430  * Insert one skb into qdisc.
431  * Note: parent depends on return value to account for queue length.
432  * 	NET_XMIT_DROP: queue length didn't change.
433  *      NET_XMIT_SUCCESS: one skb was queued.
434  */
435 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
436 			 struct sk_buff **to_free)
437 {
438 	struct netem_sched_data *q = qdisc_priv(sch);
439 	/* We don't fill cb now as skb_unshare() may invalidate it */
440 	struct netem_skb_cb *cb;
441 	struct sk_buff *skb2;
442 	struct sk_buff *segs = NULL;
443 	unsigned int prev_len = qdisc_pkt_len(skb);
444 	int count = 1;
445 	int rc = NET_XMIT_SUCCESS;
446 	int rc_drop = NET_XMIT_DROP;
447 
448 	/* Do not fool qdisc_drop_all() */
449 	skb->prev = NULL;
450 
451 	/* Random duplication */
452 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
453 		++count;
454 
455 	/* Drop packet? */
456 	if (loss_event(q)) {
457 		if (q->ecn && INET_ECN_set_ce(skb))
458 			qdisc_qstats_drop(sch); /* mark packet */
459 		else
460 			--count;
461 	}
462 	if (count == 0) {
463 		qdisc_qstats_drop(sch);
464 		__qdisc_drop(skb, to_free);
465 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
466 	}
467 
468 	/* If a delay is expected, orphan the skb. (orphaning usually takes
469 	 * place at TX completion time, so _before_ the link transit delay)
470 	 */
471 	if (q->latency || q->jitter || q->rate)
472 		skb_orphan_partial(skb);
473 
474 	/*
475 	 * If we need to duplicate packet, then re-insert at top of the
476 	 * qdisc tree, since parent queuer expects that only one
477 	 * skb will be queued.
478 	 */
479 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
480 		struct Qdisc *rootq = qdisc_root_bh(sch);
481 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
482 
483 		q->duplicate = 0;
484 		rootq->enqueue(skb2, rootq, to_free);
485 		q->duplicate = dupsave;
486 		rc_drop = NET_XMIT_SUCCESS;
487 	}
488 
489 	/*
490 	 * Randomized packet corruption.
491 	 * Make copy if needed since we are modifying
492 	 * If packet is going to be hardware checksummed, then
493 	 * do it now in software before we mangle it.
494 	 */
495 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
496 		if (skb_is_gso(skb)) {
497 			skb = netem_segment(skb, sch, to_free);
498 			if (!skb)
499 				return rc_drop;
500 			segs = skb->next;
501 			skb_mark_not_on_list(skb);
502 			qdisc_skb_cb(skb)->pkt_len = skb->len;
503 		}
504 
505 		skb = skb_unshare(skb, GFP_ATOMIC);
506 		if (unlikely(!skb)) {
507 			qdisc_qstats_drop(sch);
508 			goto finish_segs;
509 		}
510 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
511 		    skb_checksum_help(skb)) {
512 			qdisc_drop(skb, sch, to_free);
513 			skb = NULL;
514 			goto finish_segs;
515 		}
516 
517 		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
518 			1<<get_random_u32_below(8);
519 	}
520 
521 	if (unlikely(sch->q.qlen >= sch->limit)) {
522 		/* re-link segs, so that qdisc_drop_all() frees them all */
523 		skb->next = segs;
524 		qdisc_drop_all(skb, sch, to_free);
525 		return rc_drop;
526 	}
527 
528 	qdisc_qstats_backlog_inc(sch, skb);
529 
530 	cb = netem_skb_cb(skb);
531 	if (q->gap == 0 ||		/* not doing reordering */
532 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
533 	    q->reorder < get_crandom(&q->reorder_cor)) {
534 		u64 now;
535 		s64 delay;
536 
537 		delay = tabledist(q->latency, q->jitter,
538 				  &q->delay_cor, q->delay_dist);
539 
540 		now = ktime_get_ns();
541 
542 		if (q->rate) {
543 			struct netem_skb_cb *last = NULL;
544 
545 			if (sch->q.tail)
546 				last = netem_skb_cb(sch->q.tail);
547 			if (q->t_root.rb_node) {
548 				struct sk_buff *t_skb;
549 				struct netem_skb_cb *t_last;
550 
551 				t_skb = skb_rb_last(&q->t_root);
552 				t_last = netem_skb_cb(t_skb);
553 				if (!last ||
554 				    t_last->time_to_send > last->time_to_send)
555 					last = t_last;
556 			}
557 			if (q->t_tail) {
558 				struct netem_skb_cb *t_last =
559 					netem_skb_cb(q->t_tail);
560 
561 				if (!last ||
562 				    t_last->time_to_send > last->time_to_send)
563 					last = t_last;
564 			}
565 
566 			if (last) {
567 				/*
568 				 * Last packet in queue is reference point (now),
569 				 * calculate this time bonus and subtract
570 				 * from delay.
571 				 */
572 				delay -= last->time_to_send - now;
573 				delay = max_t(s64, 0, delay);
574 				now = last->time_to_send;
575 			}
576 
577 			delay += packet_time_ns(qdisc_pkt_len(skb), q);
578 		}
579 
580 		cb->time_to_send = now + delay;
581 		++q->counter;
582 		tfifo_enqueue(skb, sch);
583 	} else {
584 		/*
585 		 * Do re-ordering by putting one out of N packets at the front
586 		 * of the queue.
587 		 */
588 		cb->time_to_send = ktime_get_ns();
589 		q->counter = 0;
590 
591 		__qdisc_enqueue_head(skb, &sch->q);
592 		sch->qstats.requeues++;
593 	}
594 
595 finish_segs:
596 	if (segs) {
597 		unsigned int len, last_len;
598 		int nb;
599 
600 		len = skb ? skb->len : 0;
601 		nb = skb ? 1 : 0;
602 
603 		while (segs) {
604 			skb2 = segs->next;
605 			skb_mark_not_on_list(segs);
606 			qdisc_skb_cb(segs)->pkt_len = segs->len;
607 			last_len = segs->len;
608 			rc = qdisc_enqueue(segs, sch, to_free);
609 			if (rc != NET_XMIT_SUCCESS) {
610 				if (net_xmit_drop_count(rc))
611 					qdisc_qstats_drop(sch);
612 			} else {
613 				nb++;
614 				len += last_len;
615 			}
616 			segs = skb2;
617 		}
618 		/* Parent qdiscs accounted for 1 skb of size @prev_len */
619 		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
620 	} else if (!skb) {
621 		return NET_XMIT_DROP;
622 	}
623 	return NET_XMIT_SUCCESS;
624 }
625 
626 /* Delay the next round with a new future slot with a
627  * correct number of bytes and packets.
628  */
629 
630 static void get_slot_next(struct netem_sched_data *q, u64 now)
631 {
632 	s64 next_delay;
633 
634 	if (!q->slot_dist)
635 		next_delay = q->slot_config.min_delay +
636 				(get_random_u32() *
637 				 (q->slot_config.max_delay -
638 				  q->slot_config.min_delay) >> 32);
639 	else
640 		next_delay = tabledist(q->slot_config.dist_delay,
641 				       (s32)(q->slot_config.dist_jitter),
642 				       NULL, q->slot_dist);
643 
644 	q->slot.slot_next = now + next_delay;
645 	q->slot.packets_left = q->slot_config.max_packets;
646 	q->slot.bytes_left = q->slot_config.max_bytes;
647 }
648 
649 static struct sk_buff *netem_peek(struct netem_sched_data *q)
650 {
651 	struct sk_buff *skb = skb_rb_first(&q->t_root);
652 	u64 t1, t2;
653 
654 	if (!skb)
655 		return q->t_head;
656 	if (!q->t_head)
657 		return skb;
658 
659 	t1 = netem_skb_cb(skb)->time_to_send;
660 	t2 = netem_skb_cb(q->t_head)->time_to_send;
661 	if (t1 < t2)
662 		return skb;
663 	return q->t_head;
664 }
665 
666 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
667 {
668 	if (skb == q->t_head) {
669 		q->t_head = skb->next;
670 		if (!q->t_head)
671 			q->t_tail = NULL;
672 	} else {
673 		rb_erase(&skb->rbnode, &q->t_root);
674 	}
675 }
676 
677 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
678 {
679 	struct netem_sched_data *q = qdisc_priv(sch);
680 	struct sk_buff *skb;
681 
682 tfifo_dequeue:
683 	skb = __qdisc_dequeue_head(&sch->q);
684 	if (skb) {
685 		qdisc_qstats_backlog_dec(sch, skb);
686 deliver:
687 		qdisc_bstats_update(sch, skb);
688 		return skb;
689 	}
690 	skb = netem_peek(q);
691 	if (skb) {
692 		u64 time_to_send;
693 		u64 now = ktime_get_ns();
694 
695 		/* if more time remaining? */
696 		time_to_send = netem_skb_cb(skb)->time_to_send;
697 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
698 			get_slot_next(q, now);
699 
700 		if (time_to_send <= now && q->slot.slot_next <= now) {
701 			netem_erase_head(q, skb);
702 			sch->q.qlen--;
703 			qdisc_qstats_backlog_dec(sch, skb);
704 			skb->next = NULL;
705 			skb->prev = NULL;
706 			/* skb->dev shares skb->rbnode area,
707 			 * we need to restore its value.
708 			 */
709 			skb->dev = qdisc_dev(sch);
710 
711 			if (q->slot.slot_next) {
712 				q->slot.packets_left--;
713 				q->slot.bytes_left -= qdisc_pkt_len(skb);
714 				if (q->slot.packets_left <= 0 ||
715 				    q->slot.bytes_left <= 0)
716 					get_slot_next(q, now);
717 			}
718 
719 			if (q->qdisc) {
720 				unsigned int pkt_len = qdisc_pkt_len(skb);
721 				struct sk_buff *to_free = NULL;
722 				int err;
723 
724 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
725 				kfree_skb_list(to_free);
726 				if (err != NET_XMIT_SUCCESS &&
727 				    net_xmit_drop_count(err)) {
728 					qdisc_qstats_drop(sch);
729 					qdisc_tree_reduce_backlog(sch, 1,
730 								  pkt_len);
731 				}
732 				goto tfifo_dequeue;
733 			}
734 			goto deliver;
735 		}
736 
737 		if (q->qdisc) {
738 			skb = q->qdisc->ops->dequeue(q->qdisc);
739 			if (skb)
740 				goto deliver;
741 		}
742 
743 		qdisc_watchdog_schedule_ns(&q->watchdog,
744 					   max(time_to_send,
745 					       q->slot.slot_next));
746 	}
747 
748 	if (q->qdisc) {
749 		skb = q->qdisc->ops->dequeue(q->qdisc);
750 		if (skb)
751 			goto deliver;
752 	}
753 	return NULL;
754 }
755 
756 static void netem_reset(struct Qdisc *sch)
757 {
758 	struct netem_sched_data *q = qdisc_priv(sch);
759 
760 	qdisc_reset_queue(sch);
761 	tfifo_reset(sch);
762 	if (q->qdisc)
763 		qdisc_reset(q->qdisc);
764 	qdisc_watchdog_cancel(&q->watchdog);
765 }
766 
767 static void dist_free(struct disttable *d)
768 {
769 	kvfree(d);
770 }
771 
772 /*
773  * Distribution data is a variable size payload containing
774  * signed 16 bit values.
775  */
776 
777 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
778 			  const struct nlattr *attr)
779 {
780 	size_t n = nla_len(attr)/sizeof(__s16);
781 	const __s16 *data = nla_data(attr);
782 	spinlock_t *root_lock;
783 	struct disttable *d;
784 	int i;
785 
786 	if (!n || n > NETEM_DIST_MAX)
787 		return -EINVAL;
788 
789 	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
790 	if (!d)
791 		return -ENOMEM;
792 
793 	d->size = n;
794 	for (i = 0; i < n; i++)
795 		d->table[i] = data[i];
796 
797 	root_lock = qdisc_root_sleeping_lock(sch);
798 
799 	spin_lock_bh(root_lock);
800 	swap(*tbl, d);
801 	spin_unlock_bh(root_lock);
802 
803 	dist_free(d);
804 	return 0;
805 }
806 
807 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
808 {
809 	const struct tc_netem_slot *c = nla_data(attr);
810 
811 	q->slot_config = *c;
812 	if (q->slot_config.max_packets == 0)
813 		q->slot_config.max_packets = INT_MAX;
814 	if (q->slot_config.max_bytes == 0)
815 		q->slot_config.max_bytes = INT_MAX;
816 
817 	/* capping dist_jitter to the range acceptable by tabledist() */
818 	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
819 
820 	q->slot.packets_left = q->slot_config.max_packets;
821 	q->slot.bytes_left = q->slot_config.max_bytes;
822 	if (q->slot_config.min_delay | q->slot_config.max_delay |
823 	    q->slot_config.dist_jitter)
824 		q->slot.slot_next = ktime_get_ns();
825 	else
826 		q->slot.slot_next = 0;
827 }
828 
829 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
830 {
831 	const struct tc_netem_corr *c = nla_data(attr);
832 
833 	init_crandom(&q->delay_cor, c->delay_corr);
834 	init_crandom(&q->loss_cor, c->loss_corr);
835 	init_crandom(&q->dup_cor, c->dup_corr);
836 }
837 
838 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 	const struct tc_netem_reorder *r = nla_data(attr);
841 
842 	q->reorder = r->probability;
843 	init_crandom(&q->reorder_cor, r->correlation);
844 }
845 
846 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 	const struct tc_netem_corrupt *r = nla_data(attr);
849 
850 	q->corrupt = r->probability;
851 	init_crandom(&q->corrupt_cor, r->correlation);
852 }
853 
854 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
855 {
856 	const struct tc_netem_rate *r = nla_data(attr);
857 
858 	q->rate = r->rate;
859 	q->packet_overhead = r->packet_overhead;
860 	q->cell_size = r->cell_size;
861 	q->cell_overhead = r->cell_overhead;
862 	if (q->cell_size)
863 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
864 	else
865 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
866 }
867 
868 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
869 {
870 	const struct nlattr *la;
871 	int rem;
872 
873 	nla_for_each_nested(la, attr, rem) {
874 		u16 type = nla_type(la);
875 
876 		switch (type) {
877 		case NETEM_LOSS_GI: {
878 			const struct tc_netem_gimodel *gi = nla_data(la);
879 
880 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
881 				pr_info("netem: incorrect gi model size\n");
882 				return -EINVAL;
883 			}
884 
885 			q->loss_model = CLG_4_STATES;
886 
887 			q->clg.state = TX_IN_GAP_PERIOD;
888 			q->clg.a1 = gi->p13;
889 			q->clg.a2 = gi->p31;
890 			q->clg.a3 = gi->p32;
891 			q->clg.a4 = gi->p14;
892 			q->clg.a5 = gi->p23;
893 			break;
894 		}
895 
896 		case NETEM_LOSS_GE: {
897 			const struct tc_netem_gemodel *ge = nla_data(la);
898 
899 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
900 				pr_info("netem: incorrect ge model size\n");
901 				return -EINVAL;
902 			}
903 
904 			q->loss_model = CLG_GILB_ELL;
905 			q->clg.state = GOOD_STATE;
906 			q->clg.a1 = ge->p;
907 			q->clg.a2 = ge->r;
908 			q->clg.a3 = ge->h;
909 			q->clg.a4 = ge->k1;
910 			break;
911 		}
912 
913 		default:
914 			pr_info("netem: unknown loss type %u\n", type);
915 			return -EINVAL;
916 		}
917 	}
918 
919 	return 0;
920 }
921 
922 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
923 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
924 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
925 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
926 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
927 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
928 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
929 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
930 	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
931 	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
932 	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
933 };
934 
935 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
936 		      const struct nla_policy *policy, int len)
937 {
938 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
939 
940 	if (nested_len < 0) {
941 		pr_info("netem: invalid attributes len %d\n", nested_len);
942 		return -EINVAL;
943 	}
944 
945 	if (nested_len >= nla_attr_size(0))
946 		return nla_parse_deprecated(tb, maxtype,
947 					    nla_data(nla) + NLA_ALIGN(len),
948 					    nested_len, policy, NULL);
949 
950 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
951 	return 0;
952 }
953 
954 /* Parse netlink message to set options */
955 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
956 			struct netlink_ext_ack *extack)
957 {
958 	struct netem_sched_data *q = qdisc_priv(sch);
959 	struct nlattr *tb[TCA_NETEM_MAX + 1];
960 	struct tc_netem_qopt *qopt;
961 	struct clgstate old_clg;
962 	int old_loss_model = CLG_RANDOM;
963 	int ret;
964 
965 	qopt = nla_data(opt);
966 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
967 	if (ret < 0)
968 		return ret;
969 
970 	/* backup q->clg and q->loss_model */
971 	old_clg = q->clg;
972 	old_loss_model = q->loss_model;
973 
974 	if (tb[TCA_NETEM_LOSS]) {
975 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
976 		if (ret) {
977 			q->loss_model = old_loss_model;
978 			return ret;
979 		}
980 	} else {
981 		q->loss_model = CLG_RANDOM;
982 	}
983 
984 	if (tb[TCA_NETEM_DELAY_DIST]) {
985 		ret = get_dist_table(sch, &q->delay_dist,
986 				     tb[TCA_NETEM_DELAY_DIST]);
987 		if (ret)
988 			goto get_table_failure;
989 	}
990 
991 	if (tb[TCA_NETEM_SLOT_DIST]) {
992 		ret = get_dist_table(sch, &q->slot_dist,
993 				     tb[TCA_NETEM_SLOT_DIST]);
994 		if (ret)
995 			goto get_table_failure;
996 	}
997 
998 	sch->limit = qopt->limit;
999 
1000 	q->latency = PSCHED_TICKS2NS(qopt->latency);
1001 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1002 	q->limit = qopt->limit;
1003 	q->gap = qopt->gap;
1004 	q->counter = 0;
1005 	q->loss = qopt->loss;
1006 	q->duplicate = qopt->duplicate;
1007 
1008 	/* for compatibility with earlier versions.
1009 	 * if gap is set, need to assume 100% probability
1010 	 */
1011 	if (q->gap)
1012 		q->reorder = ~0;
1013 
1014 	if (tb[TCA_NETEM_CORR])
1015 		get_correlation(q, tb[TCA_NETEM_CORR]);
1016 
1017 	if (tb[TCA_NETEM_REORDER])
1018 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1019 
1020 	if (tb[TCA_NETEM_CORRUPT])
1021 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1022 
1023 	if (tb[TCA_NETEM_RATE])
1024 		get_rate(q, tb[TCA_NETEM_RATE]);
1025 
1026 	if (tb[TCA_NETEM_RATE64])
1027 		q->rate = max_t(u64, q->rate,
1028 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1029 
1030 	if (tb[TCA_NETEM_LATENCY64])
1031 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1032 
1033 	if (tb[TCA_NETEM_JITTER64])
1034 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1035 
1036 	if (tb[TCA_NETEM_ECN])
1037 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1038 
1039 	if (tb[TCA_NETEM_SLOT])
1040 		get_slot(q, tb[TCA_NETEM_SLOT]);
1041 
1042 	/* capping jitter to the range acceptable by tabledist() */
1043 	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1044 
1045 	return ret;
1046 
1047 get_table_failure:
1048 	/* recover clg and loss_model, in case of
1049 	 * q->clg and q->loss_model were modified
1050 	 * in get_loss_clg()
1051 	 */
1052 	q->clg = old_clg;
1053 	q->loss_model = old_loss_model;
1054 	return ret;
1055 }
1056 
1057 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1058 		      struct netlink_ext_ack *extack)
1059 {
1060 	struct netem_sched_data *q = qdisc_priv(sch);
1061 	int ret;
1062 
1063 	qdisc_watchdog_init(&q->watchdog, sch);
1064 
1065 	if (!opt)
1066 		return -EINVAL;
1067 
1068 	q->loss_model = CLG_RANDOM;
1069 	ret = netem_change(sch, opt, extack);
1070 	if (ret)
1071 		pr_info("netem: change failed\n");
1072 	return ret;
1073 }
1074 
1075 static void netem_destroy(struct Qdisc *sch)
1076 {
1077 	struct netem_sched_data *q = qdisc_priv(sch);
1078 
1079 	qdisc_watchdog_cancel(&q->watchdog);
1080 	if (q->qdisc)
1081 		qdisc_put(q->qdisc);
1082 	dist_free(q->delay_dist);
1083 	dist_free(q->slot_dist);
1084 }
1085 
1086 static int dump_loss_model(const struct netem_sched_data *q,
1087 			   struct sk_buff *skb)
1088 {
1089 	struct nlattr *nest;
1090 
1091 	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1092 	if (nest == NULL)
1093 		goto nla_put_failure;
1094 
1095 	switch (q->loss_model) {
1096 	case CLG_RANDOM:
1097 		/* legacy loss model */
1098 		nla_nest_cancel(skb, nest);
1099 		return 0;	/* no data */
1100 
1101 	case CLG_4_STATES: {
1102 		struct tc_netem_gimodel gi = {
1103 			.p13 = q->clg.a1,
1104 			.p31 = q->clg.a2,
1105 			.p32 = q->clg.a3,
1106 			.p14 = q->clg.a4,
1107 			.p23 = q->clg.a5,
1108 		};
1109 
1110 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1111 			goto nla_put_failure;
1112 		break;
1113 	}
1114 	case CLG_GILB_ELL: {
1115 		struct tc_netem_gemodel ge = {
1116 			.p = q->clg.a1,
1117 			.r = q->clg.a2,
1118 			.h = q->clg.a3,
1119 			.k1 = q->clg.a4,
1120 		};
1121 
1122 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1123 			goto nla_put_failure;
1124 		break;
1125 	}
1126 	}
1127 
1128 	nla_nest_end(skb, nest);
1129 	return 0;
1130 
1131 nla_put_failure:
1132 	nla_nest_cancel(skb, nest);
1133 	return -1;
1134 }
1135 
1136 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1137 {
1138 	const struct netem_sched_data *q = qdisc_priv(sch);
1139 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1140 	struct tc_netem_qopt qopt;
1141 	struct tc_netem_corr cor;
1142 	struct tc_netem_reorder reorder;
1143 	struct tc_netem_corrupt corrupt;
1144 	struct tc_netem_rate rate;
1145 	struct tc_netem_slot slot;
1146 
1147 	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1148 			     UINT_MAX);
1149 	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1150 			    UINT_MAX);
1151 	qopt.limit = q->limit;
1152 	qopt.loss = q->loss;
1153 	qopt.gap = q->gap;
1154 	qopt.duplicate = q->duplicate;
1155 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1156 		goto nla_put_failure;
1157 
1158 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1159 		goto nla_put_failure;
1160 
1161 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1162 		goto nla_put_failure;
1163 
1164 	cor.delay_corr = q->delay_cor.rho;
1165 	cor.loss_corr = q->loss_cor.rho;
1166 	cor.dup_corr = q->dup_cor.rho;
1167 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1168 		goto nla_put_failure;
1169 
1170 	reorder.probability = q->reorder;
1171 	reorder.correlation = q->reorder_cor.rho;
1172 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1173 		goto nla_put_failure;
1174 
1175 	corrupt.probability = q->corrupt;
1176 	corrupt.correlation = q->corrupt_cor.rho;
1177 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1178 		goto nla_put_failure;
1179 
1180 	if (q->rate >= (1ULL << 32)) {
1181 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1182 				      TCA_NETEM_PAD))
1183 			goto nla_put_failure;
1184 		rate.rate = ~0U;
1185 	} else {
1186 		rate.rate = q->rate;
1187 	}
1188 	rate.packet_overhead = q->packet_overhead;
1189 	rate.cell_size = q->cell_size;
1190 	rate.cell_overhead = q->cell_overhead;
1191 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1192 		goto nla_put_failure;
1193 
1194 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1195 		goto nla_put_failure;
1196 
1197 	if (dump_loss_model(q, skb) != 0)
1198 		goto nla_put_failure;
1199 
1200 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1201 	    q->slot_config.dist_jitter) {
1202 		slot = q->slot_config;
1203 		if (slot.max_packets == INT_MAX)
1204 			slot.max_packets = 0;
1205 		if (slot.max_bytes == INT_MAX)
1206 			slot.max_bytes = 0;
1207 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1208 			goto nla_put_failure;
1209 	}
1210 
1211 	return nla_nest_end(skb, nla);
1212 
1213 nla_put_failure:
1214 	nlmsg_trim(skb, nla);
1215 	return -1;
1216 }
1217 
1218 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1219 			  struct sk_buff *skb, struct tcmsg *tcm)
1220 {
1221 	struct netem_sched_data *q = qdisc_priv(sch);
1222 
1223 	if (cl != 1 || !q->qdisc) 	/* only one class */
1224 		return -ENOENT;
1225 
1226 	tcm->tcm_handle |= TC_H_MIN(1);
1227 	tcm->tcm_info = q->qdisc->handle;
1228 
1229 	return 0;
1230 }
1231 
1232 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1233 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1234 {
1235 	struct netem_sched_data *q = qdisc_priv(sch);
1236 
1237 	*old = qdisc_replace(sch, new, &q->qdisc);
1238 	return 0;
1239 }
1240 
1241 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1242 {
1243 	struct netem_sched_data *q = qdisc_priv(sch);
1244 	return q->qdisc;
1245 }
1246 
1247 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1248 {
1249 	return 1;
1250 }
1251 
1252 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1253 {
1254 	if (!walker->stop) {
1255 		if (!tc_qdisc_stats_dump(sch, 1, walker))
1256 			return;
1257 	}
1258 }
1259 
1260 static const struct Qdisc_class_ops netem_class_ops = {
1261 	.graft		=	netem_graft,
1262 	.leaf		=	netem_leaf,
1263 	.find		=	netem_find,
1264 	.walk		=	netem_walk,
1265 	.dump		=	netem_dump_class,
1266 };
1267 
1268 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1269 	.id		=	"netem",
1270 	.cl_ops		=	&netem_class_ops,
1271 	.priv_size	=	sizeof(struct netem_sched_data),
1272 	.enqueue	=	netem_enqueue,
1273 	.dequeue	=	netem_dequeue,
1274 	.peek		=	qdisc_peek_dequeued,
1275 	.init		=	netem_init,
1276 	.reset		=	netem_reset,
1277 	.destroy	=	netem_destroy,
1278 	.change		=	netem_change,
1279 	.dump		=	netem_dump,
1280 	.owner		=	THIS_MODULE,
1281 };
1282 
1283 
1284 static int __init netem_module_init(void)
1285 {
1286 	pr_info("netem: version " VERSION "\n");
1287 	return register_qdisc(&netem_qdisc_ops);
1288 }
1289 static void __exit netem_module_exit(void)
1290 {
1291 	unregister_qdisc(&netem_qdisc_ops);
1292 }
1293 module_init(netem_module_init)
1294 module_exit(netem_module_exit)
1295 MODULE_LICENSE("GPL");
1296