1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_netem.c	Network emulator
4  *
5  *  		Many of the algorithms and ideas for this came from
6  *		NIST Net which is not copyrighted.
7  *
8  * Authors:	Stephen Hemminger <shemminger@osdl.org>
9  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10  */
11 
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/rtnetlink.h>
21 #include <linux/reciprocal_div.h>
22 #include <linux/rbtree.h>
23 
24 #include <net/gso.h>
25 #include <net/netlink.h>
26 #include <net/pkt_sched.h>
27 #include <net/inet_ecn.h>
28 
29 #define VERSION "1.3"
30 
31 /*	Network Emulation Queuing algorithm.
32 	====================================
33 
34 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
35 		 Network Emulation Tool"
36 		 [2] Luigi Rizzo, DummyNet for FreeBSD
37 
38 	 ----------------------------------------------------------------
39 
40 	 This started out as a simple way to delay outgoing packets to
41 	 test TCP but has grown to include most of the functionality
42 	 of a full blown network emulator like NISTnet. It can delay
43 	 packets and add random jitter (and correlation). The random
44 	 distribution can be loaded from a table as well to provide
45 	 normal, Pareto, or experimental curves. Packet loss,
46 	 duplication, and reordering can also be emulated.
47 
48 	 This qdisc does not do classification; that can be handled by
49 	 layering other disciplines on top of it.  It does not need to do
50 	 bandwidth control either, since that can be handled by using a
51 	 token bucket or other rate control qdisc.
52 
53      Correlated Loss Generator models
54 
55 	Added generation of correlated loss according to a 4-state
56 	Markov chain (the GI model) and the 2-state "Gilbert-Elliot" model.
57 
58 	References:
59 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
60 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
61 	and intuitive loss model for packet networks and its implementation
62 	in the Netem module in the Linux kernel", available in [1]
63 
64 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
65 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
66 */
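
/* Example configuration (illustrative only; uses iproute2's tc, and the
 * device name is a placeholder):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.3%
 *	tc qdisc change dev eth0 root netem delay 50ms reorder 25% 50%
 *
 * The first command adds 100ms of delay with 10ms of jitter (25%
 * correlation) and 0.3% random loss; the second changes it to a 50ms
 * delay where 25% of packets (50% correlation) are sent immediately,
 * creating reordering.
 */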
67 
68 struct disttable {
69 	u32  size;
70 	s16 table[];
71 };
72 
73 struct netem_sched_data {
74 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
75 	struct rb_root t_root;
76 
77 	/* a linear queue; reduces rbtree rebalancing when jitter is low */
78 	struct sk_buff	*t_head;
79 	struct sk_buff	*t_tail;
80 
81 	/* optional qdisc for classful handling (NULL at netem init) */
82 	struct Qdisc	*qdisc;
83 
84 	struct qdisc_watchdog watchdog;
85 
86 	s64 latency;
87 	s64 jitter;
88 
89 	u32 loss;
90 	u32 ecn;
91 	u32 limit;
92 	u32 counter;
93 	u32 gap;
94 	u32 duplicate;
95 	u32 reorder;
96 	u32 corrupt;
97 	u64 rate;
98 	s32 packet_overhead;
99 	u32 cell_size;
100 	struct reciprocal_value cell_size_reciprocal;
101 	s32 cell_overhead;
102 
103 	struct crndstate {
104 		u32 last;
105 		u32 rho;
106 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
107 
108 	struct disttable *delay_dist;
109 
110 	enum  {
111 		CLG_RANDOM,
112 		CLG_4_STATES,
113 		CLG_GILB_ELL,
114 	} loss_model;
115 
116 	enum {
117 		TX_IN_GAP_PERIOD = 1,
118 		TX_IN_BURST_PERIOD,
119 		LOST_IN_GAP_PERIOD,
120 		LOST_IN_BURST_PERIOD,
121 	} _4_state_model;
122 
123 	enum {
124 		GOOD_STATE = 1,
125 		BAD_STATE,
126 	} GE_state_model;
127 
128 	/* Correlated Loss Generation models */
129 	struct clgstate {
130 		/* state of the Markov chain */
131 		u8 state;
132 
133 		/* 4-states and Gilbert-Elliot models */
134 		u32 a1;	/* p13 for 4-states or p for GE */
135 		u32 a2;	/* p31 for 4-states or r for GE */
136 		u32 a3;	/* p32 for 4-states or h for GE */
137 		u32 a4;	/* p14 for 4-states or 1-k for GE */
138 		u32 a5; /* p23 used only in 4-states */
139 	} clg;
140 
141 	struct tc_netem_slot slot_config;
142 	struct slotstate {
143 		u64 slot_next;
144 		s32 packets_left;
145 		s32 bytes_left;
146 	} slot;
147 
148 	struct disttable *slot_dist;
149 };
150 
151 /* Time stamp put into socket buffer control block
152  * Only valid when skbs are in our internal t(ime)fifo queue.
153  *
154  * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
155  * and skb->next & skb->prev are scratch space for a qdisc,
156  * we save skb->tstamp value in skb->cb[] before destroying it.
157  */
158 struct netem_skb_cb {
159 	u64	        time_to_send;
160 };
161 
162 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
163 {
164 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
165 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
166 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
167 }
168 
169 /* init_crandom - initialize correlated random number generator
170  * Use entropy source for initial seed.
171  */
172 static void init_crandom(struct crndstate *state, unsigned long rho)
173 {
174 	state->rho = rho;
175 	state->last = get_random_u32();
176 }
177 
178 /* get_crandom - correlated random number generator
179  * Next number depends on last value.
180  * rho is scaled to avoid floating point.
181  */
182 static u32 get_crandom(struct crndstate *state)
183 {
184 	u64 value, rho;
185 	unsigned long answer;
186 
187 	if (!state || state->rho == 0)	/* no correlation */
188 		return get_random_u32();
189 
190 	value = get_random_u32();
191 	rho = (u64)state->rho + 1;
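	/* Fixed-point blend: answer = value * (1 - rho/2^32) + last * rho/2^32,
	 * i.e. the new sample is correlated with the previous one with
	 * coefficient rho (expressed as a fraction of 2^32).
	 */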
192 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
193 	state->last = answer;
194 	return answer;
195 }
196 
197 /* loss_4state - 4-state model loss generator
198  * Generates losses according to the 4-state Markov chain adopted in
199  * the GI (General and Intuitive) loss model.
200  */
201 static bool loss_4state(struct netem_sched_data *q)
202 {
203 	struct clgstate *clg = &q->clg;
204 	u32 rnd = get_random_u32();
205 
206 	/*
207 	 * Makes a comparison between rnd and the transition
208 	 * probabilities outgoing from the current state, then decides the
209 	 * next state and if the next packet has to be transmitted or lost.
210 	 * The four states correspond to:
211 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
212 	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
213 	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
214 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
215 	 */
216 	switch (clg->state) {
217 	case TX_IN_GAP_PERIOD:
218 		if (rnd < clg->a4) {
219 			clg->state = LOST_IN_GAP_PERIOD;
220 			return true;
221 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
222 			clg->state = LOST_IN_BURST_PERIOD;
223 			return true;
224 		} else if (clg->a1 + clg->a4 < rnd) {
225 			clg->state = TX_IN_GAP_PERIOD;
226 		}
227 
228 		break;
229 	case TX_IN_BURST_PERIOD:
230 		if (rnd < clg->a5) {
231 			clg->state = LOST_IN_BURST_PERIOD;
232 			return true;
233 		} else {
234 			clg->state = TX_IN_BURST_PERIOD;
235 		}
236 
237 		break;
238 	case LOST_IN_BURST_PERIOD:
239 		if (rnd < clg->a3)
240 			clg->state = TX_IN_BURST_PERIOD;
241 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
242 			clg->state = TX_IN_GAP_PERIOD;
243 		} else if (clg->a2 + clg->a3 < rnd) {
244 			clg->state = LOST_IN_BURST_PERIOD;
245 			return true;
246 		}
247 		break;
248 	case LOST_IN_GAP_PERIOD:
249 		clg->state = TX_IN_GAP_PERIOD;
250 		break;
251 	}
252 
253 	return false;
254 }
255 
256 /* loss_gilb_ell - Gilbert-Elliot model loss generator
257  * Generates losses according to the Gilbert-Elliot loss model or
258  * its special cases  (Gilbert or Simple Gilbert)
259  *
260  * Makes a comparison between random number and the transition
261  * probabilities outgoing from the current state, then decides the
262  * next state. A second random number is extracted and the comparison
263  * with the loss probability of the current state decides if the next
264  * packet will be transmitted or lost.
265  */
266 static bool loss_gilb_ell(struct netem_sched_data *q)
267 {
268 	struct clgstate *clg = &q->clg;
269 
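	/* a1 = P(GOOD->BAD), a2 = P(BAD->GOOD); a4 (= 1-k) is the loss
	 * probability in the good state and a3 (= h) the transmission
	 * probability in the bad state, all expressed as fractions of 2^32.
	 */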
270 	switch (clg->state) {
271 	case GOOD_STATE:
272 		if (get_random_u32() < clg->a1)
273 			clg->state = BAD_STATE;
274 		if (get_random_u32() < clg->a4)
275 			return true;
276 		break;
277 	case BAD_STATE:
278 		if (get_random_u32() < clg->a2)
279 			clg->state = GOOD_STATE;
280 		if (get_random_u32() > clg->a3)
281 			return true;
282 	}
283 
284 	return false;
285 }
286 
287 static bool loss_event(struct netem_sched_data *q)
288 {
289 	switch (q->loss_model) {
290 	case CLG_RANDOM:
291 		/* Random packet drop 0 => none, ~0 => all */
292 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
293 
294 	case CLG_4_STATES:
295 		/* 4-state loss model algorithm (used also for the GI model).
296 		 * Asks the 4-state Markov chain loss generator whether the
297 		 * next packet should be dropped; returns true when the chain
298 		 * moves to a loss state.
299 		 */
300 		return loss_4state(q);
301 
302 	case CLG_GILB_ELL:
303 		/* Gilbert-Elliot loss model algorithm.
304 		 * Asks the Gilbert-Elliot loss generator whether the next
305 		 * packet should be dropped, based on the current GOOD/BAD
306 		 * state and its loss probability.
307 		 */
308 		return loss_gilb_ell(q);
309 	}
310 
311 	return false;	/* not reached */
312 }
313 
314 
315 /* tabledist - return a pseudo-randomly distributed value with mean mu and
316  * std deviation sigma.  Uses table lookup to approximate the desired
317  * distribution, and a uniformly-distributed pseudo-random source.
318  */
319 static s64 tabledist(s64 mu, s32 sigma,
320 		     struct crndstate *state,
321 		     const struct disttable *dist)
322 {
323 	s64 x;
324 	long t;
325 	u32 rnd;
326 
327 	if (sigma == 0)
328 		return mu;
329 
330 	rnd = get_crandom(state);
331 
332 	/* default uniform distribution */
333 	if (dist == NULL)
334 		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
335 
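	/* t is a sample from the scaled distribution table.  sigma is split
	 * into its quotient and remainder by NETEM_DIST_SCALE so the result
	 * approximates mu + t * sigma / NETEM_DIST_SCALE (with rounding)
	 * without overflowing the intermediate product.
	 */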
336 	t = dist->table[rnd % dist->size];
337 	x = (sigma % NETEM_DIST_SCALE) * t;
338 	if (x >= 0)
339 		x += NETEM_DIST_SCALE/2;
340 	else
341 		x -= NETEM_DIST_SCALE/2;
342 
343 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
344 }
345 
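/* Transmission time, in nanoseconds, of a packet of @len bytes at q->rate,
 * including the configured per-packet overhead and, when a cell size is
 * set, padding of the last (partially filled) cell plus per-cell overhead.
 */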
346 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
347 {
348 	len += q->packet_overhead;
349 
350 	if (q->cell_size) {
351 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
352 
353 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
354 			cells++;
355 		len = cells * (q->cell_size + q->cell_overhead);
356 	}
357 
358 	return div64_u64(len * NSEC_PER_SEC, q->rate);
359 }
360 
361 static void tfifo_reset(struct Qdisc *sch)
362 {
363 	struct netem_sched_data *q = qdisc_priv(sch);
364 	struct rb_node *p = rb_first(&q->t_root);
365 
366 	while (p) {
367 		struct sk_buff *skb = rb_to_skb(p);
368 
369 		p = rb_next(p);
370 		rb_erase(&skb->rbnode, &q->t_root);
371 		rtnl_kfree_skbs(skb, skb);
372 	}
373 
374 	rtnl_kfree_skbs(q->t_head, q->t_tail);
375 	q->t_head = NULL;
376 	q->t_tail = NULL;
377 }
378 
379 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
380 {
381 	struct netem_sched_data *q = qdisc_priv(sch);
382 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
383 
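	/* Fast path: when packets arrive with non-decreasing send times
	 * (e.g. no jitter), append to the linear t_head/t_tail list and
	 * avoid rbtree rebalancing; otherwise insert into the rbtree
	 * ordered by time_to_send.
	 */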
384 	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
385 		if (q->t_tail)
386 			q->t_tail->next = nskb;
387 		else
388 			q->t_head = nskb;
389 		q->t_tail = nskb;
390 	} else {
391 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
392 
393 		while (*p) {
394 			struct sk_buff *skb;
395 
396 			parent = *p;
397 			skb = rb_to_skb(parent);
398 			if (tnext >= netem_skb_cb(skb)->time_to_send)
399 				p = &parent->rb_right;
400 			else
401 				p = &parent->rb_left;
402 		}
403 		rb_link_node(&nskb->rbnode, parent, p);
404 		rb_insert_color(&nskb->rbnode, &q->t_root);
405 	}
406 	sch->q.qlen++;
407 }
408 
409 /* netem can't properly corrupt a megapacket (like we get from GSO), so when
410  * we statistically choose to corrupt one, we segment it instead, returning
411  * the first segment to be corrupted and re-enqueueing the remaining frames.
412  */
413 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
414 				     struct sk_buff **to_free)
415 {
416 	struct sk_buff *segs;
417 	netdev_features_t features = netif_skb_features(skb);
418 
419 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
420 
421 	if (IS_ERR_OR_NULL(segs)) {
422 		qdisc_drop(skb, sch, to_free);
423 		return NULL;
424 	}
425 	consume_skb(skb);
426 	return segs;
427 }
428 
429 /*
430  * Insert one skb into qdisc.
431  * Note: parent depends on return value to account for queue length.
432  *	NET_XMIT_DROP: queue length didn't change.
433  *	NET_XMIT_SUCCESS: one skb was queued.
434  */
435 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
436 			 struct sk_buff **to_free)
437 {
438 	struct netem_sched_data *q = qdisc_priv(sch);
439 	/* We don't fill cb now as skb_unshare() may invalidate it */
440 	struct netem_skb_cb *cb;
441 	struct sk_buff *skb2;
442 	struct sk_buff *segs = NULL;
443 	unsigned int prev_len = qdisc_pkt_len(skb);
444 	int count = 1;
445 	int rc = NET_XMIT_SUCCESS;
446 	int rc_drop = NET_XMIT_DROP;
447 
448 	/* Do not fool qdisc_drop_all() */
449 	skb->prev = NULL;
450 
451 	/* Random duplication */
452 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
453 		++count;
454 
455 	/* Drop packet? */
456 	if (loss_event(q)) {
457 		if (q->ecn && INET_ECN_set_ce(skb))
458 			qdisc_qstats_drop(sch); /* mark packet */
459 		else
460 			--count;
461 	}
462 	if (count == 0) {
463 		qdisc_qstats_drop(sch);
464 		__qdisc_drop(skb, to_free);
465 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
466 	}
467 
468 	/* If a delay is expected, orphan the skb. (orphaning usually takes
469 	 * place at TX completion time, so _before_ the link transit delay)
470 	 */
471 	if (q->latency || q->jitter || q->rate)
472 		skb_orphan_partial(skb);
473 
474 	/*
475 	 * If we need to duplicate packet, then re-insert at top of the
476 	 * qdisc tree, since parent queuer expects that only one
477 	 * skb will be queued.
478 	 */
479 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
480 		struct Qdisc *rootq = qdisc_root_bh(sch);
481 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
482 
483 		q->duplicate = 0;
484 		rootq->enqueue(skb2, rootq, to_free);
485 		q->duplicate = dupsave;
486 		rc_drop = NET_XMIT_SUCCESS;
487 	}
488 
489 	/*
490 	 * Randomized packet corruption.
491 	 * Make copy if needed since we are modifying
492 	 * If packet is going to be hardware checksummed, then
493 	 * do it now in software before we mangle it.
494 	 */
495 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
496 		if (skb_is_gso(skb)) {
497 			skb = netem_segment(skb, sch, to_free);
498 			if (!skb)
499 				return rc_drop;
500 			segs = skb->next;
501 			skb_mark_not_on_list(skb);
502 			qdisc_skb_cb(skb)->pkt_len = skb->len;
503 		}
504 
505 		skb = skb_unshare(skb, GFP_ATOMIC);
506 		if (unlikely(!skb)) {
507 			qdisc_qstats_drop(sch);
508 			goto finish_segs;
509 		}
510 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
511 		    skb_checksum_help(skb)) {
512 			qdisc_drop(skb, sch, to_free);
513 			skb = NULL;
514 			goto finish_segs;
515 		}
516 
517 		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
518 			1<<get_random_u32_below(8);
519 	}
520 
521 	if (unlikely(sch->q.qlen >= sch->limit)) {
522 		/* re-link segs, so that qdisc_drop_all() frees them all */
523 		skb->next = segs;
524 		qdisc_drop_all(skb, sch, to_free);
525 		return rc_drop;
526 	}
527 
528 	qdisc_qstats_backlog_inc(sch, skb);
529 
530 	cb = netem_skb_cb(skb);
531 	if (q->gap == 0 ||		/* not doing reordering */
532 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
533 	    q->reorder < get_crandom(&q->reorder_cor)) {
534 		u64 now;
535 		s64 delay;
536 
537 		delay = tabledist(q->latency, q->jitter,
538 				  &q->delay_cor, q->delay_dist);
539 
540 		now = ktime_get_ns();
541 
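		/* With rate emulation, the reference point is the departure
		 * time of the last queued packet (tfifo or child queue);
		 * this packet's own serialization delay is added on top.
		 */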
542 		if (q->rate) {
543 			struct netem_skb_cb *last = NULL;
544 
545 			if (sch->q.tail)
546 				last = netem_skb_cb(sch->q.tail);
547 			if (q->t_root.rb_node) {
548 				struct sk_buff *t_skb;
549 				struct netem_skb_cb *t_last;
550 
551 				t_skb = skb_rb_last(&q->t_root);
552 				t_last = netem_skb_cb(t_skb);
553 				if (!last ||
554 				    t_last->time_to_send > last->time_to_send)
555 					last = t_last;
556 			}
557 			if (q->t_tail) {
558 				struct netem_skb_cb *t_last =
559 					netem_skb_cb(q->t_tail);
560 
561 				if (!last ||
562 				    t_last->time_to_send > last->time_to_send)
563 					last = t_last;
564 			}
565 
566 			if (last) {
567 				/*
568 				 * Last packet in queue is reference point (now),
569 				 * calculate this time bonus and subtract
570 				 * from delay.
571 				 */
572 				delay -= last->time_to_send - now;
573 				delay = max_t(s64, 0, delay);
574 				now = last->time_to_send;
575 			}
576 
577 			delay += packet_time_ns(qdisc_pkt_len(skb), q);
578 		}
579 
580 		cb->time_to_send = now + delay;
581 		++q->counter;
582 		tfifo_enqueue(skb, sch);
583 	} else {
584 		/*
585 		 * Do re-ordering by putting one out of N packets at the front
586 		 * of the queue.
587 		 */
588 		cb->time_to_send = ktime_get_ns();
589 		q->counter = 0;
590 
591 		__qdisc_enqueue_head(skb, &sch->q);
592 		sch->qstats.requeues++;
593 	}
594 
595 finish_segs:
596 	if (segs) {
597 		unsigned int len, last_len;
598 		int nb;
599 
600 		len = skb ? skb->len : 0;
601 		nb = skb ? 1 : 0;
602 
603 		while (segs) {
604 			skb2 = segs->next;
605 			skb_mark_not_on_list(segs);
606 			qdisc_skb_cb(segs)->pkt_len = segs->len;
607 			last_len = segs->len;
608 			rc = qdisc_enqueue(segs, sch, to_free);
609 			if (rc != NET_XMIT_SUCCESS) {
610 				if (net_xmit_drop_count(rc))
611 					qdisc_qstats_drop(sch);
612 			} else {
613 				nb++;
614 				len += last_len;
615 			}
616 			segs = skb2;
617 		}
618 		/* Parent qdiscs accounted for 1 skb of size @prev_len */
619 		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
620 	} else if (!skb) {
621 		return NET_XMIT_DROP;
622 	}
623 	return NET_XMIT_SUCCESS;
624 }
625 
626 /* Compute the start time of the next delivery slot and reset the
627  * per-slot packet and byte budgets.
628  */
629 
630 static void get_slot_next(struct netem_sched_data *q, u64 now)
631 {
632 	s64 next_delay;
633 
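	/* Without a slot distribution table, pick the gap uniformly in
	 * [min_delay, max_delay] using 32-bit fixed point; otherwise draw
	 * it from the configured distribution around dist_delay.
	 */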
634 	if (!q->slot_dist)
635 		next_delay = q->slot_config.min_delay +
636 				(get_random_u32() *
637 				 (q->slot_config.max_delay -
638 				  q->slot_config.min_delay) >> 32);
639 	else
640 		next_delay = tabledist(q->slot_config.dist_delay,
641 				       (s32)(q->slot_config.dist_jitter),
642 				       NULL, q->slot_dist);
643 
644 	q->slot.slot_next = now + next_delay;
645 	q->slot.packets_left = q->slot_config.max_packets;
646 	q->slot.bytes_left = q->slot_config.max_bytes;
647 }
648 
649 static struct sk_buff *netem_peek(struct netem_sched_data *q)
650 {
651 	struct sk_buff *skb = skb_rb_first(&q->t_root);
652 	u64 t1, t2;
653 
654 	if (!skb)
655 		return q->t_head;
656 	if (!q->t_head)
657 		return skb;
658 
659 	t1 = netem_skb_cb(skb)->time_to_send;
660 	t2 = netem_skb_cb(q->t_head)->time_to_send;
661 	if (t1 < t2)
662 		return skb;
663 	return q->t_head;
664 }
665 
666 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
667 {
668 	if (skb == q->t_head) {
669 		q->t_head = skb->next;
670 		if (!q->t_head)
671 			q->t_tail = NULL;
672 	} else {
673 		rb_erase(&skb->rbnode, &q->t_root);
674 	}
675 }
676 
677 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
678 {
679 	struct netem_sched_data *q = qdisc_priv(sch);
680 	struct sk_buff *skb;
681 
682 tfifo_dequeue:
683 	skb = __qdisc_dequeue_head(&sch->q);
684 	if (skb) {
685 		qdisc_qstats_backlog_dec(sch, skb);
686 deliver:
687 		qdisc_bstats_update(sch, skb);
688 		return skb;
689 	}
690 	skb = netem_peek(q);
691 	if (skb) {
692 		u64 time_to_send;
693 		u64 now = ktime_get_ns();
694 
695 		/* is the packet's send time still in the future? */
696 		time_to_send = netem_skb_cb(skb)->time_to_send;
697 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
698 			get_slot_next(q, now);
699 
700 		if (time_to_send <= now && q->slot.slot_next <= now) {
701 			netem_erase_head(q, skb);
702 			sch->q.qlen--;
703 			qdisc_qstats_backlog_dec(sch, skb);
704 			skb->next = NULL;
705 			skb->prev = NULL;
706 			/* skb->dev shares skb->rbnode area,
707 			 * we need to restore its value.
708 			 */
709 			skb->dev = qdisc_dev(sch);
710 
711 			if (q->slot.slot_next) {
712 				q->slot.packets_left--;
713 				q->slot.bytes_left -= qdisc_pkt_len(skb);
714 				if (q->slot.packets_left <= 0 ||
715 				    q->slot.bytes_left <= 0)
716 					get_slot_next(q, now);
717 			}
718 
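			/* If a child qdisc was grafted, hand the now-due skb
			 * to it instead of delivering directly; delivery then
			 * happens from the child on a later dequeue.
			 */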
719 			if (q->qdisc) {
720 				unsigned int pkt_len = qdisc_pkt_len(skb);
721 				struct sk_buff *to_free = NULL;
722 				int err;
723 
724 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
725 				kfree_skb_list(to_free);
726 				if (err != NET_XMIT_SUCCESS &&
727 				    net_xmit_drop_count(err)) {
728 					qdisc_qstats_drop(sch);
729 					qdisc_tree_reduce_backlog(sch, 1,
730 								  pkt_len);
731 				}
732 				goto tfifo_dequeue;
733 			}
734 			goto deliver;
735 		}
736 
737 		if (q->qdisc) {
738 			skb = q->qdisc->ops->dequeue(q->qdisc);
739 			if (skb)
740 				goto deliver;
741 		}
742 
743 		qdisc_watchdog_schedule_ns(&q->watchdog,
744 					   max(time_to_send,
745 					       q->slot.slot_next));
746 	}
747 
748 	if (q->qdisc) {
749 		skb = q->qdisc->ops->dequeue(q->qdisc);
750 		if (skb)
751 			goto deliver;
752 	}
753 	return NULL;
754 }
755 
756 static void netem_reset(struct Qdisc *sch)
757 {
758 	struct netem_sched_data *q = qdisc_priv(sch);
759 
760 	qdisc_reset_queue(sch);
761 	tfifo_reset(sch);
762 	if (q->qdisc)
763 		qdisc_reset(q->qdisc);
764 	qdisc_watchdog_cancel(&q->watchdog);
765 }
766 
767 static void dist_free(struct disttable *d)
768 {
769 	kvfree(d);
770 }
771 
772 /*
773  * Distribution data is a variable-size payload containing
774  * signed 16-bit values.
775  */
776 
777 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
778 			  const struct nlattr *attr)
779 {
780 	size_t n = nla_len(attr)/sizeof(__s16);
781 	const __s16 *data = nla_data(attr);
782 	spinlock_t *root_lock;
783 	struct disttable *d;
784 	int i;
785 
786 	if (!n || n > NETEM_DIST_MAX)
787 		return -EINVAL;
788 
789 	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
790 	if (!d)
791 		return -ENOMEM;
792 
793 	d->size = n;
794 	for (i = 0; i < n; i++)
795 		d->table[i] = data[i];
796 
797 	root_lock = qdisc_root_sleeping_lock(sch);
798 
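	/* Publish the new table under the qdisc root lock, then free
	 * whatever table it replaced (possibly NULL on first use).
	 */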
799 	spin_lock_bh(root_lock);
800 	swap(*tbl, d);
801 	spin_unlock_bh(root_lock);
802 
803 	dist_free(d);
804 	return 0;
805 }
806 
807 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
808 {
809 	const struct tc_netem_slot *c = nla_data(attr);
810 
811 	q->slot_config = *c;
812 	if (q->slot_config.max_packets == 0)
813 		q->slot_config.max_packets = INT_MAX;
814 	if (q->slot_config.max_bytes == 0)
815 		q->slot_config.max_bytes = INT_MAX;
816 
817 	/* capping dist_jitter to the range acceptable by tabledist() */
818 	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
819 
820 	q->slot.packets_left = q->slot_config.max_packets;
821 	q->slot.bytes_left = q->slot_config.max_bytes;
822 	if (q->slot_config.min_delay | q->slot_config.max_delay |
823 	    q->slot_config.dist_jitter)
824 		q->slot.slot_next = ktime_get_ns();
825 	else
826 		q->slot.slot_next = 0;
827 }
828 
829 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
830 {
831 	const struct tc_netem_corr *c = nla_data(attr);
832 
833 	init_crandom(&q->delay_cor, c->delay_corr);
834 	init_crandom(&q->loss_cor, c->loss_corr);
835 	init_crandom(&q->dup_cor, c->dup_corr);
836 }
837 
838 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 	const struct tc_netem_reorder *r = nla_data(attr);
841 
842 	q->reorder = r->probability;
843 	init_crandom(&q->reorder_cor, r->correlation);
844 }
845 
846 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 	const struct tc_netem_corrupt *r = nla_data(attr);
849 
850 	q->corrupt = r->probability;
851 	init_crandom(&q->corrupt_cor, r->correlation);
852 }
853 
854 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
855 {
856 	const struct tc_netem_rate *r = nla_data(attr);
857 
858 	q->rate = r->rate;
859 	q->packet_overhead = r->packet_overhead;
860 	q->cell_size = r->cell_size;
861 	q->cell_overhead = r->cell_overhead;
862 	if (q->cell_size)
863 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
864 	else
865 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
866 }
867 
868 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
869 {
870 	const struct nlattr *la;
871 	int rem;
872 
873 	nla_for_each_nested(la, attr, rem) {
874 		u16 type = nla_type(la);
875 
876 		switch (type) {
877 		case NETEM_LOSS_GI: {
878 			const struct tc_netem_gimodel *gi = nla_data(la);
879 
880 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
881 				pr_info("netem: incorrect gi model size\n");
882 				return -EINVAL;
883 			}
884 
885 			q->loss_model = CLG_4_STATES;
886 
887 			q->clg.state = TX_IN_GAP_PERIOD;
888 			q->clg.a1 = gi->p13;
889 			q->clg.a2 = gi->p31;
890 			q->clg.a3 = gi->p32;
891 			q->clg.a4 = gi->p14;
892 			q->clg.a5 = gi->p23;
893 			break;
894 		}
895 
896 		case NETEM_LOSS_GE: {
897 			const struct tc_netem_gemodel *ge = nla_data(la);
898 
899 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
900 				pr_info("netem: incorrect ge model size\n");
901 				return -EINVAL;
902 			}
903 
904 			q->loss_model = CLG_GILB_ELL;
905 			q->clg.state = GOOD_STATE;
906 			q->clg.a1 = ge->p;
907 			q->clg.a2 = ge->r;
908 			q->clg.a3 = ge->h;
909 			q->clg.a4 = ge->k1;
910 			break;
911 		}
912 
913 		default:
914 			pr_info("netem: unknown loss type %u\n", type);
915 			return -EINVAL;
916 		}
917 	}
918 
919 	return 0;
920 }
921 
922 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
923 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
924 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
925 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
926 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
927 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
928 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
929 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
930 	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
931 	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
932 	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
933 };
934 
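/* netem's TCA_OPTIONS payload is a struct tc_netem_qopt optionally followed
 * by netlink attributes; parse those trailing attributes (if any) against
 * @policy.
 */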
935 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
936 		      const struct nla_policy *policy, int len)
937 {
938 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
939 
940 	if (nested_len < 0) {
941 		pr_info("netem: invalid attributes len %d\n", nested_len);
942 		return -EINVAL;
943 	}
944 
945 	if (nested_len >= nla_attr_size(0))
946 		return nla_parse_deprecated(tb, maxtype,
947 					    nla_data(nla) + NLA_ALIGN(len),
948 					    nested_len, policy, NULL);
949 
950 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
951 	return 0;
952 }
953 
954 /* Parse netlink message to set options */
955 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
956 			struct netlink_ext_ack *extack)
957 {
958 	struct netem_sched_data *q = qdisc_priv(sch);
959 	struct nlattr *tb[TCA_NETEM_MAX + 1];
960 	struct tc_netem_qopt *qopt;
961 	struct clgstate old_clg;
962 	int old_loss_model = CLG_RANDOM;
963 	int ret;
964 
965 	qopt = nla_data(opt);
966 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
967 	if (ret < 0)
968 		return ret;
969 
970 	sch_tree_lock(sch);
971 	/* backup q->clg and q->loss_model */
972 	old_clg = q->clg;
973 	old_loss_model = q->loss_model;
974 
975 	if (tb[TCA_NETEM_LOSS]) {
976 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
977 		if (ret) {
978 			q->loss_model = old_loss_model;
979 			goto unlock;
980 		}
981 	} else {
982 		q->loss_model = CLG_RANDOM;
983 	}
984 
985 	if (tb[TCA_NETEM_DELAY_DIST]) {
986 		ret = get_dist_table(sch, &q->delay_dist,
987 				     tb[TCA_NETEM_DELAY_DIST]);
988 		if (ret)
989 			goto get_table_failure;
990 	}
991 
992 	if (tb[TCA_NETEM_SLOT_DIST]) {
993 		ret = get_dist_table(sch, &q->slot_dist,
994 				     tb[TCA_NETEM_SLOT_DIST]);
995 		if (ret)
996 			goto get_table_failure;
997 	}
998 
999 	sch->limit = qopt->limit;
1000 
1001 	q->latency = PSCHED_TICKS2NS(qopt->latency);
1002 	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1003 	q->limit = qopt->limit;
1004 	q->gap = qopt->gap;
1005 	q->counter = 0;
1006 	q->loss = qopt->loss;
1007 	q->duplicate = qopt->duplicate;
1008 
1009 	/* for compatibility with earlier versions:
1010 	 * if gap is set, assume 100% reorder probability
1011 	 */
1012 	if (q->gap)
1013 		q->reorder = ~0;
1014 
1015 	if (tb[TCA_NETEM_CORR])
1016 		get_correlation(q, tb[TCA_NETEM_CORR]);
1017 
1018 	if (tb[TCA_NETEM_REORDER])
1019 		get_reorder(q, tb[TCA_NETEM_REORDER]);
1020 
1021 	if (tb[TCA_NETEM_CORRUPT])
1022 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1023 
1024 	if (tb[TCA_NETEM_RATE])
1025 		get_rate(q, tb[TCA_NETEM_RATE]);
1026 
1027 	if (tb[TCA_NETEM_RATE64])
1028 		q->rate = max_t(u64, q->rate,
1029 				nla_get_u64(tb[TCA_NETEM_RATE64]));
1030 
1031 	if (tb[TCA_NETEM_LATENCY64])
1032 		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1033 
1034 	if (tb[TCA_NETEM_JITTER64])
1035 		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1036 
1037 	if (tb[TCA_NETEM_ECN])
1038 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1039 
1040 	if (tb[TCA_NETEM_SLOT])
1041 		get_slot(q, tb[TCA_NETEM_SLOT]);
1042 
1043 	/* capping jitter to the range acceptable by tabledist() */
1044 	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1045 
1046 unlock:
1047 	sch_tree_unlock(sch);
1048 	return ret;
1049 
1050 get_table_failure:
1051 	/* restore clg and loss_model, in case
1052 	 * q->clg and q->loss_model were modified
1053 	 * by get_loss_clg()
1054 	 */
1055 	q->clg = old_clg;
1056 	q->loss_model = old_loss_model;
1057 
1058 	goto unlock;
1059 }
1060 
1061 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1062 		      struct netlink_ext_ack *extack)
1063 {
1064 	struct netem_sched_data *q = qdisc_priv(sch);
1065 	int ret;
1066 
1067 	qdisc_watchdog_init(&q->watchdog, sch);
1068 
1069 	if (!opt)
1070 		return -EINVAL;
1071 
1072 	q->loss_model = CLG_RANDOM;
1073 	ret = netem_change(sch, opt, extack);
1074 	if (ret)
1075 		pr_info("netem: change failed\n");
1076 	return ret;
1077 }
1078 
1079 static void netem_destroy(struct Qdisc *sch)
1080 {
1081 	struct netem_sched_data *q = qdisc_priv(sch);
1082 
1083 	qdisc_watchdog_cancel(&q->watchdog);
1084 	if (q->qdisc)
1085 		qdisc_put(q->qdisc);
1086 	dist_free(q->delay_dist);
1087 	dist_free(q->slot_dist);
1088 }
1089 
1090 static int dump_loss_model(const struct netem_sched_data *q,
1091 			   struct sk_buff *skb)
1092 {
1093 	struct nlattr *nest;
1094 
1095 	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1096 	if (nest == NULL)
1097 		goto nla_put_failure;
1098 
1099 	switch (q->loss_model) {
1100 	case CLG_RANDOM:
1101 		/* legacy loss model */
1102 		nla_nest_cancel(skb, nest);
1103 		return 0;	/* no data */
1104 
1105 	case CLG_4_STATES: {
1106 		struct tc_netem_gimodel gi = {
1107 			.p13 = q->clg.a1,
1108 			.p31 = q->clg.a2,
1109 			.p32 = q->clg.a3,
1110 			.p14 = q->clg.a4,
1111 			.p23 = q->clg.a5,
1112 		};
1113 
1114 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1115 			goto nla_put_failure;
1116 		break;
1117 	}
1118 	case CLG_GILB_ELL: {
1119 		struct tc_netem_gemodel ge = {
1120 			.p = q->clg.a1,
1121 			.r = q->clg.a2,
1122 			.h = q->clg.a3,
1123 			.k1 = q->clg.a4,
1124 		};
1125 
1126 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1127 			goto nla_put_failure;
1128 		break;
1129 	}
1130 	}
1131 
1132 	nla_nest_end(skb, nest);
1133 	return 0;
1134 
1135 nla_put_failure:
1136 	nla_nest_cancel(skb, nest);
1137 	return -1;
1138 }
1139 
1140 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1141 {
1142 	const struct netem_sched_data *q = qdisc_priv(sch);
1143 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1144 	struct tc_netem_qopt qopt;
1145 	struct tc_netem_corr cor;
1146 	struct tc_netem_reorder reorder;
1147 	struct tc_netem_corrupt corrupt;
1148 	struct tc_netem_rate rate;
1149 	struct tc_netem_slot slot;
1150 
1151 	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1152 			     UINT_MAX);
1153 	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1154 			    UINT_MAX);
1155 	qopt.limit = q->limit;
1156 	qopt.loss = q->loss;
1157 	qopt.gap = q->gap;
1158 	qopt.duplicate = q->duplicate;
1159 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1160 		goto nla_put_failure;
1161 
1162 	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1163 		goto nla_put_failure;
1164 
1165 	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1166 		goto nla_put_failure;
1167 
1168 	cor.delay_corr = q->delay_cor.rho;
1169 	cor.loss_corr = q->loss_cor.rho;
1170 	cor.dup_corr = q->dup_cor.rho;
1171 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1172 		goto nla_put_failure;
1173 
1174 	reorder.probability = q->reorder;
1175 	reorder.correlation = q->reorder_cor.rho;
1176 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1177 		goto nla_put_failure;
1178 
1179 	corrupt.probability = q->corrupt;
1180 	corrupt.correlation = q->corrupt_cor.rho;
1181 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1182 		goto nla_put_failure;
1183 
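	/* Rates that do not fit in 32 bits are exported via TCA_NETEM_RATE64;
	 * the legacy 32-bit rate field is saturated to ~0U in that case.
	 */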
1184 	if (q->rate >= (1ULL << 32)) {
1185 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1186 				      TCA_NETEM_PAD))
1187 			goto nla_put_failure;
1188 		rate.rate = ~0U;
1189 	} else {
1190 		rate.rate = q->rate;
1191 	}
1192 	rate.packet_overhead = q->packet_overhead;
1193 	rate.cell_size = q->cell_size;
1194 	rate.cell_overhead = q->cell_overhead;
1195 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1196 		goto nla_put_failure;
1197 
1198 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1199 		goto nla_put_failure;
1200 
1201 	if (dump_loss_model(q, skb) != 0)
1202 		goto nla_put_failure;
1203 
1204 	if (q->slot_config.min_delay | q->slot_config.max_delay |
1205 	    q->slot_config.dist_jitter) {
1206 		slot = q->slot_config;
1207 		if (slot.max_packets == INT_MAX)
1208 			slot.max_packets = 0;
1209 		if (slot.max_bytes == INT_MAX)
1210 			slot.max_bytes = 0;
1211 		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1212 			goto nla_put_failure;
1213 	}
1214 
1215 	return nla_nest_end(skb, nla);
1216 
1217 nla_put_failure:
1218 	nlmsg_trim(skb, nla);
1219 	return -1;
1220 }
1221 
1222 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1223 			  struct sk_buff *skb, struct tcmsg *tcm)
1224 {
1225 	struct netem_sched_data *q = qdisc_priv(sch);
1226 
1227 	if (cl != 1 || !q->qdisc) 	/* only one class */
1228 		return -ENOENT;
1229 
1230 	tcm->tcm_handle |= TC_H_MIN(1);
1231 	tcm->tcm_info = q->qdisc->handle;
1232 
1233 	return 0;
1234 }
1235 
1236 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1237 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1238 {
1239 	struct netem_sched_data *q = qdisc_priv(sch);
1240 
1241 	*old = qdisc_replace(sch, new, &q->qdisc);
1242 	return 0;
1243 }
1244 
1245 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1246 {
1247 	struct netem_sched_data *q = qdisc_priv(sch);
1248 	return q->qdisc;
1249 }
1250 
1251 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1252 {
1253 	return 1;
1254 }
1255 
1256 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1257 {
1258 	if (!walker->stop) {
1259 		if (!tc_qdisc_stats_dump(sch, 1, walker))
1260 			return;
1261 	}
1262 }
1263 
1264 static const struct Qdisc_class_ops netem_class_ops = {
1265 	.graft		=	netem_graft,
1266 	.leaf		=	netem_leaf,
1267 	.find		=	netem_find,
1268 	.walk		=	netem_walk,
1269 	.dump		=	netem_dump_class,
1270 };
1271 
1272 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1273 	.id		=	"netem",
1274 	.cl_ops		=	&netem_class_ops,
1275 	.priv_size	=	sizeof(struct netem_sched_data),
1276 	.enqueue	=	netem_enqueue,
1277 	.dequeue	=	netem_dequeue,
1278 	.peek		=	qdisc_peek_dequeued,
1279 	.init		=	netem_init,
1280 	.reset		=	netem_reset,
1281 	.destroy	=	netem_destroy,
1282 	.change		=	netem_change,
1283 	.dump		=	netem_dump,
1284 	.owner		=	THIS_MODULE,
1285 };
1286 
1287 
1288 static int __init netem_module_init(void)
1289 {
1290 	pr_info("netem: version " VERSION "\n");
1291 	return register_qdisc(&netem_qdisc_ops);
1292 }
1293 static void __exit netem_module_exit(void)
1294 {
1295 	unregister_qdisc(&netem_qdisc_ops);
1296 }
1297 module_init(netem_module_init)
1298 module_exit(netem_module_exit)
1299 MODULE_LICENSE("GPL");
1300