1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * net/sched/sch_netem.c Network emulator
4 *
5 * Many of the algorithms and ideas for this came from
6 * NIST Net which is not copyrighted.
7 *
8 * Authors: Stephen Hemminger <shemminger@osdl.org>
9 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10 */
11
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/rtnetlink.h>
21 #include <linux/reciprocal_div.h>
22 #include <linux/rbtree.h>
23
24 #include <net/gso.h>
25 #include <net/netlink.h>
26 #include <net/pkt_sched.h>
27 #include <net/inet_ecn.h>
28
29 #define VERSION "1.3"
30
31 /* Network Emulation Queuing algorithm.
32 ====================================
33
34 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
35 Network Emulation Tool"
36 [2] Luigi Rizzo, DummyNet for FreeBSD
37
38 ----------------------------------------------------------------
39
40 This started out as a simple way to delay outgoing packets to
41 test TCP but has grown to include most of the functionality
42 of a full blown network emulator like NISTnet. It can delay
43 packets and add random jitter (and correlation). The random
44 distribution can be loaded from a table as well to provide
45 normal, Pareto, or experimental curves. Packet loss,
46 duplication, and reordering can also be emulated.
47
48 This qdisc does not do classification; that can be handled by
49 layering other disciplines. It does not need to do bandwidth
50 control either, since that can be handled by using a token
51 bucket or other rate control.
52
53 Correlated Loss Generator models
54
55 Added generation of correlated loss according to the
56 "Gilbert-Elliot" model and a more general 4-state Markov model.
57
58 References:
59 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
60 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
61 and intuitive loss model for packet networks and its implementation
62 in the Netem module in the Linux kernel", available in [1]
63
64 Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
65 Fabio Ludovici <fabio.ludovici at yahoo.it>
66 */
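/* Illustration only: these features are normally driven from userspace via
 * tc(8). A few typical invocations, assuming the standard tc-netem(8)
 * syntax (shown here purely as a usage sketch, not part of this file):
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *   tc qdisc change dev eth0 root netem loss 0.3% 25%
 *   tc qdisc change dev eth0 root netem duplicate 1% corrupt 0.1%
 *   tc qdisc change dev eth0 root netem delay 10ms reorder 25% 50%
 *
 * Each option maps onto one of the netlink attributes parsed in
 * netem_change() below.
 */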
67
68 struct disttable {
69 u32 size;
70 s16 table[];
71 };
72
73 struct netem_sched_data {
74 /* internal t(ime)fifo qdisc uses t_root and sch->limit */
75 struct rb_root t_root;
76
77 /* a linear queue; reduces rbtree rebalancing when jitter is low */
78 struct sk_buff *t_head;
79 struct sk_buff *t_tail;
80
81 u32 t_len;
82
83 /* optional qdisc for classful handling (NULL at netem init) */
84 struct Qdisc *qdisc;
85
86 struct qdisc_watchdog watchdog;
87
88 s64 latency;
89 s64 jitter;
90
91 u32 loss;
92 u32 ecn;
93 u32 limit;
94 u32 counter;
95 u32 gap;
96 u32 duplicate;
97 u32 reorder;
98 u32 corrupt;
99 u64 rate;
100 s32 packet_overhead;
101 u32 cell_size;
102 struct reciprocal_value cell_size_reciprocal;
103 s32 cell_overhead;
104
105 struct crndstate {
106 u32 last;
107 u32 rho;
108 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
109
110 struct prng {
111 u64 seed;
112 struct rnd_state prng_state;
113 } prng;
114
115 struct disttable *delay_dist;
116
117 enum {
118 CLG_RANDOM,
119 CLG_4_STATES,
120 CLG_GILB_ELL,
121 } loss_model;
122
123 enum {
124 TX_IN_GAP_PERIOD = 1,
125 TX_IN_BURST_PERIOD,
126 LOST_IN_GAP_PERIOD,
127 LOST_IN_BURST_PERIOD,
128 } _4_state_model;
129
130 enum {
131 GOOD_STATE = 1,
132 BAD_STATE,
133 } GE_state_model;
134
135 /* Correlated Loss Generation models */
136 struct clgstate {
137 /* state of the Markov chain */
138 u8 state;
139
140 /* 4-states and Gilbert-Elliot models */
141 u32 a1; /* p13 for 4-states or p for GE */
142 u32 a2; /* p31 for 4-states or r for GE */
143 u32 a3; /* p32 for 4-states or h for GE */
144 u32 a4; /* p14 for 4-states or 1-k for GE */
145 u32 a5; /* p23 used only in 4-states */
146 } clg;
147
148 struct tc_netem_slot slot_config;
149 struct slotstate {
150 u64 slot_next;
151 s32 packets_left;
152 s32 bytes_left;
153 } slot;
154
155 struct disttable *slot_dist;
156 };
157
158 /* Time stamp put into socket buffer control block
159 * Only valid when skbs are in our internal t(ime)fifo queue.
160 *
161 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
162 * and skb->next & skb->prev are scratch space for a qdisc,
163 * we save the skb->tstamp value in skb->cb[] before destroying it.
164 */
165 struct netem_skb_cb {
166 u64 time_to_send;
167 };
168
169 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
170 {
171 /* we assume we can use skb next/prev/tstamp as storage for rb_node */
172 qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
173 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
174 }
175
176 /* init_crandom - initialize correlated random number generator
177 * Use entropy source for initial seed.
178 */
179 static void init_crandom(struct crndstate *state, unsigned long rho)
180 {
181 state->rho = rho;
182 state->last = get_random_u32();
183 }
184
185 /* get_crandom - correlated random number generator
186 * Next number depends on last value.
187 * rho is scaled to avoid floating point.
188 */
189 static u32 get_crandom(struct crndstate *state, struct prng *p)
190 {
191 u64 value, rho;
192 unsigned long answer;
193 struct rnd_state *s = &p->prng_state;
194
195 if (!state || state->rho == 0) /* no correlation */
196 return prandom_u32_state(s);
197
198 value = prandom_u32_state(s);
199 rho = (u64)state->rho + 1;
200 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
201 state->last = answer;
202 return answer;
203 }
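/* Worked example (a sketch, not exercised by the code): with a 50%
 * correlation, state->rho is roughly 1U << 31, so
 *
 *   answer ~= (value * (1ull << 31) + last * (1ull << 31)) >> 32
 *          ~= (value + last) / 2
 *
 * i.e. each output is an equal blend of fresh randomness and the previous
 * output, while rho == 0 degenerates to plain prandom_u32_state().
 */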
204
205 /* loss_4state - 4-state model loss generator
206 * Generates losses according to the 4-state Markov chain adopted in
207 * the GI (General and Intuitive) loss model.
208 */
209 static bool loss_4state(struct netem_sched_data *q)
210 {
211 struct clgstate *clg = &q->clg;
212 u32 rnd = prandom_u32_state(&q->prng.prng_state);
213
214 /*
215 * Makes a comparison between rnd and the transition
216 * probabilities outgoing from the current state, then decides the
217 * next state and if the next packet has to be transmitted or lost.
218 * The four states correspond to:
219 * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
220 * LOST_IN_GAP_PERIOD => isolated losses within a gap period
221 * LOST_IN_BURST_PERIOD => lost packets within a burst period
222 * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
223 */
224 switch (clg->state) {
225 case TX_IN_GAP_PERIOD:
226 if (rnd < clg->a4) {
227 clg->state = LOST_IN_GAP_PERIOD;
228 return true;
229 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
230 clg->state = LOST_IN_BURST_PERIOD;
231 return true;
232 } else if (clg->a1 + clg->a4 < rnd) {
233 clg->state = TX_IN_GAP_PERIOD;
234 }
235
236 break;
237 case TX_IN_BURST_PERIOD:
238 if (rnd < clg->a5) {
239 clg->state = LOST_IN_BURST_PERIOD;
240 return true;
241 } else {
242 clg->state = TX_IN_BURST_PERIOD;
243 }
244
245 break;
246 case LOST_IN_BURST_PERIOD:
247 if (rnd < clg->a3)
248 clg->state = TX_IN_BURST_PERIOD;
249 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
250 clg->state = TX_IN_GAP_PERIOD;
251 } else if (clg->a2 + clg->a3 < rnd) {
252 clg->state = LOST_IN_BURST_PERIOD;
253 return true;
254 }
255 break;
256 case LOST_IN_GAP_PERIOD:
257 clg->state = TX_IN_GAP_PERIOD;
258 break;
259 }
260
261 return false;
262 }
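/* Example of how the thresholds above partition the 32-bit random value
 * while in TX_IN_GAP_PERIOD (probabilities are scaled so ~0U == 100%):
 *
 *   [0, a4)          -> LOST_IN_GAP_PERIOD   (isolated loss, p14)
 *   [a4, a1 + a4)    -> LOST_IN_BURST_PERIOD (burst starts,  p13)
 *   [a1 + a4, 2^32)  -> stay in TX_IN_GAP_PERIOD
 *
 * The other states work the same way with their own transition
 * probabilities (a2 = p31, a3 = p32, a5 = p23).
 */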
263
264 /* loss_gilb_ell - Gilbert-Elliot model loss generator
265 * Generates losses according to the Gilbert-Elliot loss model or
266 * its special cases (Gilbert or Simple Gilbert)
267 *
268 * Makes a comparison between random number and the transition
269 * probabilities outgoing from the current state, then decides the
270 * next state. A second random number is extracted and the comparison
271 * with the loss probability of the current state decides if the next
272 * packet will be transmitted or lost.
273 */
274 static bool loss_gilb_ell(struct netem_sched_data *q)
275 {
276 struct clgstate *clg = &q->clg;
277 struct rnd_state *s = &q->prng.prng_state;
278
279 switch (clg->state) {
280 case GOOD_STATE:
281 if (prandom_u32_state(s) < clg->a1)
282 clg->state = BAD_STATE;
283 if (prandom_u32_state(s) < clg->a4)
284 return true;
285 break;
286 case BAD_STATE:
287 if (prandom_u32_state(s) < clg->a2)
288 clg->state = GOOD_STATE;
289 if (prandom_u32_state(s) > clg->a3)
290 return true;
291 }
292
293 return false;
294 }
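/* Parameter mapping, for reference (a sketch): a1 = p (GOOD -> BAD
 * transition probability), a2 = r (BAD -> GOOD), a3 = h (probability of
 * still delivering a packet while in BAD_STATE, hence the ">" test above)
 * and a4 = 1-k (loss probability while in GOOD_STATE). With a3 == 0 and
 * a4 == 0 this reduces to the two-parameter Simple Gilbert model.
 * Userspace typically configures it with "tc ... netem loss gemodel ..."
 * (see tc-netem(8) for the exact syntax).
 */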
295
296 static bool loss_event(struct netem_sched_data *q)
297 {
298 switch (q->loss_model) {
299 case CLG_RANDOM:
300 /* Random packet drop 0 => none, ~0 => all */
301 return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);
302
303 case CLG_4_STATES:
304 /* 4-state loss model algorithm (also used for the GI model).
305 * Extracts a value from the 4-state Markov loss generator;
306 * if the generator signals a loss event, the packet
307 * is dropped.
308 */
309 return loss_4state(q);
310
311 case CLG_GILB_ELL:
312 /* Gilbert-Elliot loss model algorithm
313 * Extracts a value from the Gilbert-Elliot loss generator;
314 * if the generator signals a loss event, the packet
315 * is dropped.
316 */
317 return loss_gilb_ell(q);
318 }
319
320 return false; /* not reached */
321 }
322
323
324 /* tabledist - return a pseudo-randomly distributed value with mean mu and
325 * std deviation sigma. Uses table lookup to approximate the desired
326 * distribution, and a uniformly-distributed pseudo-random source.
327 */
328 static s64 tabledist(s64 mu, s32 sigma,
329 struct crndstate *state,
330 struct prng *prng,
331 const struct disttable *dist)
332 {
333 s64 x;
334 long t;
335 u32 rnd;
336
337 if (sigma == 0)
338 return mu;
339
340 rnd = get_crandom(state, prng);
341
342 /* default uniform distribution */
343 if (dist == NULL)
344 return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
345
346 t = dist->table[rnd % dist->size];
347 x = (sigma % NETEM_DIST_SCALE) * t;
348 if (x >= 0)
349 x += NETEM_DIST_SCALE/2;
350 else
351 x -= NETEM_DIST_SCALE/2;
352
353 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
354 }
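/* Worked example (a sketch): table entries are pre-scaled by
 * NETEM_DIST_SCALE (8192), so the result is approximately
 *
 *   mu + t * sigma / NETEM_DIST_SCALE
 *
 * e.g. with mu = 100ms, sigma = 10ms and a table entry t = 4096 the
 * returned delay is about 105ms, while t = -8192 gives about 90ms.
 * Splitting sigma into (sigma % SCALE) and (sigma / SCALE) keeps the
 * intermediate products small.
 */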
355
356 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
357 {
358 len += q->packet_overhead;
359
360 if (q->cell_size) {
361 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
362
363 if (len > cells * q->cell_size) /* extra cell needed for remainder */
364 cells++;
365 len = cells * (q->cell_size + q->cell_overhead);
366 }
367
368 return div64_u64(len * NSEC_PER_SEC, q->rate);
369 }
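/* Worked example (a sketch, assuming q->rate is in bytes per second as
 * programmed by userspace): a 1500 byte packet at rate = 125000 (1 Mbit/s)
 * occupies the link for 1500 * 1e9 / 125000 = 12,000,000 ns = 12 ms.
 * With ATM-like framing (cell_size = 48, cell_overhead = 5), a 100 byte
 * packet is first rounded up to 3 cells, i.e. 3 * (48 + 5) = 159 bytes,
 * before the division.
 */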
370
371 static void tfifo_reset(struct Qdisc *sch)
372 {
373 struct netem_sched_data *q = qdisc_priv(sch);
374 struct rb_node *p = rb_first(&q->t_root);
375
376 while (p) {
377 struct sk_buff *skb = rb_to_skb(p);
378
379 p = rb_next(p);
380 rb_erase(&skb->rbnode, &q->t_root);
381 rtnl_kfree_skbs(skb, skb);
382 }
383
384 rtnl_kfree_skbs(q->t_head, q->t_tail);
385 q->t_head = NULL;
386 q->t_tail = NULL;
387 q->t_len = 0;
388 }
389
390 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
391 {
392 struct netem_sched_data *q = qdisc_priv(sch);
393 u64 tnext = netem_skb_cb(nskb)->time_to_send;
394
395 if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
396 if (q->t_tail)
397 q->t_tail->next = nskb;
398 else
399 q->t_head = nskb;
400 q->t_tail = nskb;
401 } else {
402 struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
403
404 while (*p) {
405 struct sk_buff *skb;
406
407 parent = *p;
408 skb = rb_to_skb(parent);
409 if (tnext >= netem_skb_cb(skb)->time_to_send)
410 p = &parent->rb_right;
411 else
412 p = &parent->rb_left;
413 }
414 rb_link_node(&nskb->rbnode, parent, p);
415 rb_insert_color(&nskb->rbnode, &q->t_root);
416 }
417 q->t_len++;
418 sch->q.qlen++;
419 }
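/* Note on the structure above: when the new time_to_send is not earlier
 * than the current tail (the common case with little or no jitter), the
 * skb is appended to the plain t_head/t_tail list and the rbtree is left
 * untouched; only out-of-order timestamps pay the O(log n) rbtree insert.
 */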
420
421 /* netem can't properly corrupt a megapacket (like we get from GSO), so
422 * when we statistically choose to corrupt one, we instead segment it, returning
423 * the first segment to be corrupted and re-enqueuing the remaining frames.
424 */
425 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
426 struct sk_buff **to_free)
427 {
428 struct sk_buff *segs;
429 netdev_features_t features = netif_skb_features(skb);
430
431 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
432
433 if (IS_ERR_OR_NULL(segs)) {
434 qdisc_drop(skb, sch, to_free);
435 return NULL;
436 }
437 consume_skb(skb);
438 return segs;
439 }
440
441 /*
442 * Insert one skb into qdisc.
443 * Note: parent depends on return value to account for queue length.
444 * NET_XMIT_DROP: queue length didn't change.
445 * NET_XMIT_SUCCESS: one skb was queued.
446 */
447 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
448 struct sk_buff **to_free)
449 {
450 struct netem_sched_data *q = qdisc_priv(sch);
451 /* We don't fill cb now as skb_unshare() may invalidate it */
452 struct netem_skb_cb *cb;
453 struct sk_buff *skb2 = NULL;
454 struct sk_buff *segs = NULL;
455 unsigned int prev_len = qdisc_pkt_len(skb);
456 int count = 1;
457
458 /* Do not fool qdisc_drop_all() */
459 skb->prev = NULL;
460
461 /* Random duplication */
462 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
463 ++count;
464
465 /* Drop packet? */
466 if (loss_event(q)) {
467 if (q->ecn && INET_ECN_set_ce(skb))
468 qdisc_qstats_drop(sch); /* mark packet */
469 else
470 --count;
471 }
472 if (count == 0) {
473 qdisc_qstats_drop(sch);
474 __qdisc_drop(skb, to_free);
475 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
476 }
477
478 /* If a delay is expected, orphan the skb. (orphaning usually takes
479 * place at TX completion time, so _before_ the link transit delay)
480 */
481 if (q->latency || q->jitter || q->rate)
482 skb_orphan_partial(skb);
483
484 /*
485 * If we need to duplicate packet, then clone it before
486 * original is modified.
487 */
488 if (count > 1)
489 skb2 = skb_clone(skb, GFP_ATOMIC);
490
491 /*
492 * Randomized packet corruption.
493 * Make a copy if needed since we are modifying the data.
494 * If the packet is going to be hardware checksummed, then
495 * do the checksum now in software before we mangle it.
496 */
497 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
498 if (skb_is_gso(skb)) {
499 skb = netem_segment(skb, sch, to_free);
500 if (!skb)
501 goto finish_segs;
502
503 segs = skb->next;
504 skb_mark_not_on_list(skb);
505 qdisc_skb_cb(skb)->pkt_len = skb->len;
506 }
507
508 skb = skb_unshare(skb, GFP_ATOMIC);
509 if (unlikely(!skb)) {
510 qdisc_qstats_drop(sch);
511 goto finish_segs;
512 }
513 if (skb->ip_summed == CHECKSUM_PARTIAL &&
514 skb_checksum_help(skb)) {
515 qdisc_drop(skb, sch, to_free);
516 skb = NULL;
517 goto finish_segs;
518 }
519
520 skb->data[get_random_u32_below(skb_headlen(skb))] ^=
521 1<<get_random_u32_below(8);
522 }
523
524 if (unlikely(q->t_len >= sch->limit)) {
525 /* re-link segs, so that qdisc_drop_all() frees them all */
526 skb->next = segs;
527 qdisc_drop_all(skb, sch, to_free);
528 if (skb2)
529 __qdisc_drop(skb2, to_free);
530 return NET_XMIT_DROP;
531 }
532
533 /*
534 * If doing duplication then re-insert at top of the
535 * qdisc tree, since parent queuer expects that only one
536 * skb will be queued.
537 */
538 if (skb2) {
539 struct Qdisc *rootq = qdisc_root_bh(sch);
540 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
541
542 q->duplicate = 0;
543 rootq->enqueue(skb2, rootq, to_free);
544 q->duplicate = dupsave;
545 skb2 = NULL;
546 }
547
548 qdisc_qstats_backlog_inc(sch, skb);
549
550 cb = netem_skb_cb(skb);
551 if (q->gap == 0 || /* not doing reordering */
552 q->counter < q->gap - 1 || /* inside last reordering gap */
553 q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
554 u64 now;
555 s64 delay;
556
557 delay = tabledist(q->latency, q->jitter,
558 &q->delay_cor, &q->prng, q->delay_dist);
559
560 now = ktime_get_ns();
561
562 if (q->rate) {
563 struct netem_skb_cb *last = NULL;
564
565 if (sch->q.tail)
566 last = netem_skb_cb(sch->q.tail);
567 if (q->t_root.rb_node) {
568 struct sk_buff *t_skb;
569 struct netem_skb_cb *t_last;
570
571 t_skb = skb_rb_last(&q->t_root);
572 t_last = netem_skb_cb(t_skb);
573 if (!last ||
574 t_last->time_to_send > last->time_to_send)
575 last = t_last;
576 }
577 if (q->t_tail) {
578 struct netem_skb_cb *t_last =
579 netem_skb_cb(q->t_tail);
580
581 if (!last ||
582 t_last->time_to_send > last->time_to_send)
583 last = t_last;
584 }
585
586 if (last) {
587 /*
588 * The last packet in the queue is the reference point (now);
589 * compute the time remaining until it is sent and subtract
590 * that bonus from the delay.
591 */
592 delay -= last->time_to_send - now;
593 delay = max_t(s64, 0, delay);
594 now = last->time_to_send;
595 }
596
597 delay += packet_time_ns(qdisc_pkt_len(skb), q);
598 }
599
600 cb->time_to_send = now + delay;
601 ++q->counter;
602 tfifo_enqueue(skb, sch);
603 } else {
604 /*
605 * Do re-ordering by putting one out of N packets at the front
606 * of the queue.
607 */
608 cb->time_to_send = ktime_get_ns();
609 q->counter = 0;
610
611 __qdisc_enqueue_head(skb, &sch->q);
612 sch->qstats.requeues++;
613 }
614
615 finish_segs:
616 if (skb2)
617 __qdisc_drop(skb2, to_free);
618
619 if (segs) {
620 unsigned int len, last_len;
621 int rc, nb;
622
623 len = skb ? skb->len : 0;
624 nb = skb ? 1 : 0;
625
626 while (segs) {
627 skb2 = segs->next;
628 skb_mark_not_on_list(segs);
629 qdisc_skb_cb(segs)->pkt_len = segs->len;
630 last_len = segs->len;
631 rc = qdisc_enqueue(segs, sch, to_free);
632 if (rc != NET_XMIT_SUCCESS) {
633 if (net_xmit_drop_count(rc))
634 qdisc_qstats_drop(sch);
635 } else {
636 nb++;
637 len += last_len;
638 }
639 segs = skb2;
640 }
641 /* Parent qdiscs accounted for 1 skb of size @prev_len */
642 qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
643 } else if (!skb) {
644 return NET_XMIT_DROP;
645 }
646 return NET_XMIT_SUCCESS;
647 }
648
649 /* Delay the next round by picking a new future slot with the
650 * configured budget of bytes and packets.
651 */
652
653 static void get_slot_next(struct netem_sched_data *q, u64 now)
654 {
655 s64 next_delay;
656
657 if (!q->slot_dist)
658 next_delay = q->slot_config.min_delay +
659 (get_random_u32() *
660 (q->slot_config.max_delay -
661 q->slot_config.min_delay) >> 32);
662 else
663 next_delay = tabledist(q->slot_config.dist_delay,
664 (s32)(q->slot_config.dist_jitter),
665 NULL, &q->prng, q->slot_dist);
666
667 q->slot.slot_next = now + next_delay;
668 q->slot.packets_left = q->slot_config.max_packets;
669 q->slot.bytes_left = q->slot_config.max_bytes;
670 }
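/* Slots model a bursty link (e.g. WiFi or cellular scheduling
 * opportunities): nothing is released until the next slot opens, and each
 * slot may carry at most max_packets/max_bytes. Userspace configures this
 * with something like "tc ... netem slot 800us 1ms packets 42 bytes 64k"
 * (assuming the tc-netem(8) slot syntax); a plain min/max delay takes the
 * uniform branch above, a distribution table takes the tabledist() branch.
 */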
671
672 static struct sk_buff *netem_peek(struct netem_sched_data *q)
673 {
674 struct sk_buff *skb = skb_rb_first(&q->t_root);
675 u64 t1, t2;
676
677 if (!skb)
678 return q->t_head;
679 if (!q->t_head)
680 return skb;
681
682 t1 = netem_skb_cb(skb)->time_to_send;
683 t2 = netem_skb_cb(q->t_head)->time_to_send;
684 if (t1 < t2)
685 return skb;
686 return q->t_head;
687 }
688
689 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
690 {
691 if (skb == q->t_head) {
692 q->t_head = skb->next;
693 if (!q->t_head)
694 q->t_tail = NULL;
695 } else {
696 rb_erase(&skb->rbnode, &q->t_root);
697 }
698 }
699
700 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
701 {
702 struct netem_sched_data *q = qdisc_priv(sch);
703 struct sk_buff *skb;
704
705 tfifo_dequeue:
706 skb = __qdisc_dequeue_head(&sch->q);
707 if (skb) {
708 deliver:
709 qdisc_qstats_backlog_dec(sch, skb);
710 qdisc_bstats_update(sch, skb);
711 return skb;
712 }
713 skb = netem_peek(q);
714 if (skb) {
715 u64 time_to_send;
716 u64 now = ktime_get_ns();
717
718 /* is more time remaining before this skb may be sent? */
719 time_to_send = netem_skb_cb(skb)->time_to_send;
720 if (q->slot.slot_next && q->slot.slot_next < time_to_send)
721 get_slot_next(q, now);
722
723 if (time_to_send <= now && q->slot.slot_next <= now) {
724 netem_erase_head(q, skb);
725 q->t_len--;
726 skb->next = NULL;
727 skb->prev = NULL;
728 /* skb->dev shares skb->rbnode area,
729 * we need to restore its value.
730 */
731 skb->dev = qdisc_dev(sch);
732
733 if (q->slot.slot_next) {
734 q->slot.packets_left--;
735 q->slot.bytes_left -= qdisc_pkt_len(skb);
736 if (q->slot.packets_left <= 0 ||
737 q->slot.bytes_left <= 0)
738 get_slot_next(q, now);
739 }
740
741 if (q->qdisc) {
742 unsigned int pkt_len = qdisc_pkt_len(skb);
743 struct sk_buff *to_free = NULL;
744 int err;
745
746 err = qdisc_enqueue(skb, q->qdisc, &to_free);
747 kfree_skb_list(to_free);
748 if (err != NET_XMIT_SUCCESS) {
749 if (net_xmit_drop_count(err))
750 qdisc_qstats_drop(sch);
751 qdisc_tree_reduce_backlog(sch, 1, pkt_len);
752 sch->qstats.backlog -= pkt_len;
753 sch->q.qlen--;
754 }
755 goto tfifo_dequeue;
756 }
757 sch->q.qlen--;
758 goto deliver;
759 }
760
761 if (q->qdisc) {
762 skb = q->qdisc->ops->dequeue(q->qdisc);
763 if (skb) {
764 sch->q.qlen--;
765 goto deliver;
766 }
767 }
768
769 qdisc_watchdog_schedule_ns(&q->watchdog,
770 max(time_to_send,
771 q->slot.slot_next));
772 }
773
774 if (q->qdisc) {
775 skb = q->qdisc->ops->dequeue(q->qdisc);
776 if (skb) {
777 sch->q.qlen--;
778 goto deliver;
779 }
780 }
781 return NULL;
782 }
783
784 static void netem_reset(struct Qdisc *sch)
785 {
786 struct netem_sched_data *q = qdisc_priv(sch);
787
788 qdisc_reset_queue(sch);
789 tfifo_reset(sch);
790 if (q->qdisc)
791 qdisc_reset(q->qdisc);
792 qdisc_watchdog_cancel(&q->watchdog);
793 }
794
795 static void dist_free(struct disttable *d)
796 {
797 kvfree(d);
798 }
799
800 /*
801 * Distribution data is a variable size payload containing
802 * signed 16 bit values.
803 */
804
805 static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
806 {
807 size_t n = nla_len(attr)/sizeof(__s16);
808 const __s16 *data = nla_data(attr);
809 struct disttable *d;
810 int i;
811
812 if (!n || n > NETEM_DIST_MAX)
813 return -EINVAL;
814
815 d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
816 if (!d)
817 return -ENOMEM;
818
819 d->size = n;
820 for (i = 0; i < n; i++)
821 d->table[i] = data[i];
822
823 *tbl = d;
824 return 0;
825 }
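/* The table itself comes from userspace: tc ships pre-computed inverse-CDF
 * tables (normal, pareto, paretonormal, typically installed as *.dist
 * files in tc's lib directory) and passes one of them via
 * TCA_NETEM_DELAY_DIST or TCA_NETEM_SLOT_DIST when the user requests e.g.
 * "delay 100ms 20ms distribution normal". tabledist() then indexes this
 * table with a (possibly correlated) random value.
 */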
826
827 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
828 {
829 const struct tc_netem_slot *c = nla_data(attr);
830
831 q->slot_config = *c;
832 if (q->slot_config.max_packets == 0)
833 q->slot_config.max_packets = INT_MAX;
834 if (q->slot_config.max_bytes == 0)
835 q->slot_config.max_bytes = INT_MAX;
836
837 /* capping dist_jitter to the range acceptable by tabledist() */
838 q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
839
840 q->slot.packets_left = q->slot_config.max_packets;
841 q->slot.bytes_left = q->slot_config.max_bytes;
842 if (q->slot_config.min_delay | q->slot_config.max_delay |
843 q->slot_config.dist_jitter)
844 q->slot.slot_next = ktime_get_ns();
845 else
846 q->slot.slot_next = 0;
847 }
848
849 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
850 {
851 const struct tc_netem_corr *c = nla_data(attr);
852
853 init_crandom(&q->delay_cor, c->delay_corr);
854 init_crandom(&q->loss_cor, c->loss_corr);
855 init_crandom(&q->dup_cor, c->dup_corr);
856 }
857
858 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
859 {
860 const struct tc_netem_reorder *r = nla_data(attr);
861
862 q->reorder = r->probability;
863 init_crandom(&q->reorder_cor, r->correlation);
864 }
865
866 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
867 {
868 const struct tc_netem_corrupt *r = nla_data(attr);
869
870 q->corrupt = r->probability;
871 init_crandom(&q->corrupt_cor, r->correlation);
872 }
873
874 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
875 {
876 const struct tc_netem_rate *r = nla_data(attr);
877
878 q->rate = r->rate;
879 q->packet_overhead = r->packet_overhead;
880 q->cell_size = r->cell_size;
881 q->cell_overhead = r->cell_overhead;
882 if (q->cell_size)
883 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
884 else
885 q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
886 }
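/* These four fields correspond to the tc-netem(8) option
 * "rate RATE [PACKETOVERHEAD [CELLSIZE [CELLOVERHEAD]]]" (a usage sketch;
 * see the man page for the authoritative form). packet_overhead may be
 * negative, e.g. to ignore headers that do not exist on the emulated link,
 * while cell_size/cell_overhead model cell-based framing such as ATM
 * (see packet_time_ns() above).
 */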
887
888 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
889 {
890 const struct nlattr *la;
891 int rem;
892
893 nla_for_each_nested(la, attr, rem) {
894 u16 type = nla_type(la);
895
896 switch (type) {
897 case NETEM_LOSS_GI: {
898 const struct tc_netem_gimodel *gi = nla_data(la);
899
900 if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
901 pr_info("netem: incorrect gi model size\n");
902 return -EINVAL;
903 }
904
905 q->loss_model = CLG_4_STATES;
906
907 q->clg.state = TX_IN_GAP_PERIOD;
908 q->clg.a1 = gi->p13;
909 q->clg.a2 = gi->p31;
910 q->clg.a3 = gi->p32;
911 q->clg.a4 = gi->p14;
912 q->clg.a5 = gi->p23;
913 break;
914 }
915
916 case NETEM_LOSS_GE: {
917 const struct tc_netem_gemodel *ge = nla_data(la);
918
919 if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
920 pr_info("netem: incorrect ge model size\n");
921 return -EINVAL;
922 }
923
924 q->loss_model = CLG_GILB_ELL;
925 q->clg.state = GOOD_STATE;
926 q->clg.a1 = ge->p;
927 q->clg.a2 = ge->r;
928 q->clg.a3 = ge->h;
929 q->clg.a4 = ge->k1;
930 break;
931 }
932
933 default:
934 pr_info("netem: unknown loss type %u\n", type);
935 return -EINVAL;
936 }
937 }
938
939 return 0;
940 }
941
942 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
943 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
944 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
945 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
946 [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
947 [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
948 [TCA_NETEM_ECN] = { .type = NLA_U32 },
949 [TCA_NETEM_RATE64] = { .type = NLA_U64 },
950 [TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
951 [TCA_NETEM_JITTER64] = { .type = NLA_S64 },
952 [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
953 [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 },
954 };
955
956 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
957 const struct nla_policy *policy, int len)
958 {
959 int nested_len = nla_len(nla) - NLA_ALIGN(len);
960
961 if (nested_len < 0) {
962 pr_info("netem: invalid attributes len %d\n", nested_len);
963 return -EINVAL;
964 }
965
966 if (nested_len >= nla_attr_size(0))
967 return nla_parse_deprecated(tb, maxtype,
968 nla_data(nla) + NLA_ALIGN(len),
969 nested_len, policy, NULL);
970
971 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
972 return 0;
973 }
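/* Layout sketch of the TCA_OPTIONS payload parsed here: a fixed
 * struct tc_netem_qopt header (legacy latency/limit/loss/gap/duplicate/
 * jitter fields) followed, after NLA_ALIGN padding, by optional netlink
 * attributes such as TCA_NETEM_CORR, TCA_NETEM_RATE64 or TCA_NETEM_SLOT:
 *
 *   +----------------------+----------+----------+-----+
 *   | struct tc_netem_qopt | nlattr 1 | nlattr 2 | ... |
 *   +----------------------+----------+----------+-----+
 *
 * nested_len is the size of everything after the fixed header.
 */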
974
975 /* Parse netlink message to set options */
976 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
977 struct netlink_ext_ack *extack)
978 {
979 struct netem_sched_data *q = qdisc_priv(sch);
980 struct nlattr *tb[TCA_NETEM_MAX + 1];
981 struct disttable *delay_dist = NULL;
982 struct disttable *slot_dist = NULL;
983 struct tc_netem_qopt *qopt;
984 struct clgstate old_clg;
985 int old_loss_model = CLG_RANDOM;
986 int ret;
987
988 qopt = nla_data(opt);
989 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
990 if (ret < 0)
991 return ret;
992
993 if (tb[TCA_NETEM_DELAY_DIST]) {
994 ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
995 if (ret)
996 goto table_free;
997 }
998
999 if (tb[TCA_NETEM_SLOT_DIST]) {
1000 ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
1001 if (ret)
1002 goto table_free;
1003 }
1004
1005 sch_tree_lock(sch);
1006 /* backup q->clg and q->loss_model */
1007 old_clg = q->clg;
1008 old_loss_model = q->loss_model;
1009
1010 if (tb[TCA_NETEM_LOSS]) {
1011 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
1012 if (ret) {
1013 q->loss_model = old_loss_model;
1014 q->clg = old_clg;
1015 goto unlock;
1016 }
1017 } else {
1018 q->loss_model = CLG_RANDOM;
1019 }
1020
1021 if (delay_dist)
1022 swap(q->delay_dist, delay_dist);
1023 if (slot_dist)
1024 swap(q->slot_dist, slot_dist);
1025 sch->limit = qopt->limit;
1026
1027 q->latency = PSCHED_TICKS2NS(qopt->latency);
1028 q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1029 q->limit = qopt->limit;
1030 q->gap = qopt->gap;
1031 q->counter = 0;
1032 q->loss = qopt->loss;
1033 q->duplicate = qopt->duplicate;
1034
1035 /* for compatibility with earlier versions.
1036 * if gap is set, need to assume 100% probability
1037 */
1038 if (q->gap)
1039 q->reorder = ~0;
1040
1041 if (tb[TCA_NETEM_CORR])
1042 get_correlation(q, tb[TCA_NETEM_CORR]);
1043
1044 if (tb[TCA_NETEM_REORDER])
1045 get_reorder(q, tb[TCA_NETEM_REORDER]);
1046
1047 if (tb[TCA_NETEM_CORRUPT])
1048 get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1049
1050 if (tb[TCA_NETEM_RATE])
1051 get_rate(q, tb[TCA_NETEM_RATE]);
1052
1053 if (tb[TCA_NETEM_RATE64])
1054 q->rate = max_t(u64, q->rate,
1055 nla_get_u64(tb[TCA_NETEM_RATE64]));
1056
1057 if (tb[TCA_NETEM_LATENCY64])
1058 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1059
1060 if (tb[TCA_NETEM_JITTER64])
1061 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1062
1063 if (tb[TCA_NETEM_ECN])
1064 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1065
1066 if (tb[TCA_NETEM_SLOT])
1067 get_slot(q, tb[TCA_NETEM_SLOT]);
1068
1069 /* capping jitter to the range acceptable by tabledist() */
1070 q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1071
1072 if (tb[TCA_NETEM_PRNG_SEED])
1073 q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
1074 else
1075 q->prng.seed = get_random_u64();
1076 prandom_seed_state(&q->prng.prng_state, q->prng.seed);
1077
1078 unlock:
1079 sch_tree_unlock(sch);
1080
1081 table_free:
1082 dist_free(delay_dist);
1083 dist_free(slot_dist);
1084 return ret;
1085 }
1086
1087 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1088 struct netlink_ext_ack *extack)
1089 {
1090 struct netem_sched_data *q = qdisc_priv(sch);
1091 int ret;
1092
1093 qdisc_watchdog_init(&q->watchdog, sch);
1094
1095 if (!opt)
1096 return -EINVAL;
1097
1098 q->loss_model = CLG_RANDOM;
1099 ret = netem_change(sch, opt, extack);
1100 if (ret)
1101 pr_info("netem: change failed\n");
1102 return ret;
1103 }
1104
1105 static void netem_destroy(struct Qdisc *sch)
1106 {
1107 struct netem_sched_data *q = qdisc_priv(sch);
1108
1109 qdisc_watchdog_cancel(&q->watchdog);
1110 if (q->qdisc)
1111 qdisc_put(q->qdisc);
1112 dist_free(q->delay_dist);
1113 dist_free(q->slot_dist);
1114 }
1115
1116 static int dump_loss_model(const struct netem_sched_data *q,
1117 struct sk_buff *skb)
1118 {
1119 struct nlattr *nest;
1120
1121 nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1122 if (nest == NULL)
1123 goto nla_put_failure;
1124
1125 switch (q->loss_model) {
1126 case CLG_RANDOM:
1127 /* legacy loss model */
1128 nla_nest_cancel(skb, nest);
1129 return 0; /* no data */
1130
1131 case CLG_4_STATES: {
1132 struct tc_netem_gimodel gi = {
1133 .p13 = q->clg.a1,
1134 .p31 = q->clg.a2,
1135 .p32 = q->clg.a3,
1136 .p14 = q->clg.a4,
1137 .p23 = q->clg.a5,
1138 };
1139
1140 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1141 goto nla_put_failure;
1142 break;
1143 }
1144 case CLG_GILB_ELL: {
1145 struct tc_netem_gemodel ge = {
1146 .p = q->clg.a1,
1147 .r = q->clg.a2,
1148 .h = q->clg.a3,
1149 .k1 = q->clg.a4,
1150 };
1151
1152 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1153 goto nla_put_failure;
1154 break;
1155 }
1156 }
1157
1158 nla_nest_end(skb, nest);
1159 return 0;
1160
1161 nla_put_failure:
1162 nla_nest_cancel(skb, nest);
1163 return -1;
1164 }
1165
1166 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1167 {
1168 const struct netem_sched_data *q = qdisc_priv(sch);
1169 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1170 struct tc_netem_qopt qopt;
1171 struct tc_netem_corr cor;
1172 struct tc_netem_reorder reorder;
1173 struct tc_netem_corrupt corrupt;
1174 struct tc_netem_rate rate;
1175 struct tc_netem_slot slot;
1176
1177 qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1178 UINT_MAX);
1179 qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1180 UINT_MAX);
1181 qopt.limit = q->limit;
1182 qopt.loss = q->loss;
1183 qopt.gap = q->gap;
1184 qopt.duplicate = q->duplicate;
1185 if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1186 goto nla_put_failure;
1187
1188 if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1189 goto nla_put_failure;
1190
1191 if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1192 goto nla_put_failure;
1193
1194 cor.delay_corr = q->delay_cor.rho;
1195 cor.loss_corr = q->loss_cor.rho;
1196 cor.dup_corr = q->dup_cor.rho;
1197 if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1198 goto nla_put_failure;
1199
1200 reorder.probability = q->reorder;
1201 reorder.correlation = q->reorder_cor.rho;
1202 if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1203 goto nla_put_failure;
1204
1205 corrupt.probability = q->corrupt;
1206 corrupt.correlation = q->corrupt_cor.rho;
1207 if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1208 goto nla_put_failure;
1209
1210 if (q->rate >= (1ULL << 32)) {
1211 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1212 TCA_NETEM_PAD))
1213 goto nla_put_failure;
1214 rate.rate = ~0U;
1215 } else {
1216 rate.rate = q->rate;
1217 }
1218 rate.packet_overhead = q->packet_overhead;
1219 rate.cell_size = q->cell_size;
1220 rate.cell_overhead = q->cell_overhead;
1221 if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1222 goto nla_put_failure;
1223
1224 if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1225 goto nla_put_failure;
1226
1227 if (dump_loss_model(q, skb) != 0)
1228 goto nla_put_failure;
1229
1230 if (q->slot_config.min_delay | q->slot_config.max_delay |
1231 q->slot_config.dist_jitter) {
1232 slot = q->slot_config;
1233 if (slot.max_packets == INT_MAX)
1234 slot.max_packets = 0;
1235 if (slot.max_bytes == INT_MAX)
1236 slot.max_bytes = 0;
1237 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1238 goto nla_put_failure;
1239 }
1240
1241 if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
1242 TCA_NETEM_PAD))
1243 goto nla_put_failure;
1244
1245 return nla_nest_end(skb, nla);
1246
1247 nla_put_failure:
1248 nlmsg_trim(skb, nla);
1249 return -1;
1250 }
1251
1252 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1253 struct sk_buff *skb, struct tcmsg *tcm)
1254 {
1255 struct netem_sched_data *q = qdisc_priv(sch);
1256
1257 if (cl != 1 || !q->qdisc) /* only one class */
1258 return -ENOENT;
1259
1260 tcm->tcm_handle |= TC_H_MIN(1);
1261 tcm->tcm_info = q->qdisc->handle;
1262
1263 return 0;
1264 }
1265
1266 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1267 struct Qdisc **old, struct netlink_ext_ack *extack)
1268 {
1269 struct netem_sched_data *q = qdisc_priv(sch);
1270
1271 *old = qdisc_replace(sch, new, &q->qdisc);
1272 return 0;
1273 }
1274
1275 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1276 {
1277 struct netem_sched_data *q = qdisc_priv(sch);
1278 return q->qdisc;
1279 }
1280
1281 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1282 {
1283 return 1;
1284 }
1285
1286 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1287 {
1288 if (!walker->stop) {
1289 if (!tc_qdisc_stats_dump(sch, 1, walker))
1290 return;
1291 }
1292 }
1293
1294 static const struct Qdisc_class_ops netem_class_ops = {
1295 .graft = netem_graft,
1296 .leaf = netem_leaf,
1297 .find = netem_find,
1298 .walk = netem_walk,
1299 .dump = netem_dump_class,
1300 };
1301
1302 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1303 .id = "netem",
1304 .cl_ops = &netem_class_ops,
1305 .priv_size = sizeof(struct netem_sched_data),
1306 .enqueue = netem_enqueue,
1307 .dequeue = netem_dequeue,
1308 .peek = qdisc_peek_dequeued,
1309 .init = netem_init,
1310 .reset = netem_reset,
1311 .destroy = netem_destroy,
1312 .change = netem_change,
1313 .dump = netem_dump,
1314 .owner = THIS_MODULE,
1315 };
1316
1317
1318 static int __init netem_module_init(void)
1319 {
1320 pr_info("netem: version " VERSION "\n");
1321 return register_qdisc(&netem_qdisc_ops);
1322 }
1323 static void __exit netem_module_exit(void)
1324 {
1325 unregister_qdisc(&netem_qdisc_ops);
1326 }
1327 module_init(netem_module_init)
1328 module_exit(netem_module_exit)
1329 MODULE_LICENSE("GPL");
1330