// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		     Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can be loaded from a table as well to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	    and intuitive loss model for packet networks and its implementation
	    in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/

struct disttable {
	u32  size;
	s16 table[];
};

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;

	u32 t_len;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	s64 latency;
	s64 jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct prng {
		u64 seed;
		struct rnd_state prng_state;
	} prng;

	struct disttable *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	u64	        time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = get_random_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state, struct prng *p)
{
	u64 value, rho;
	unsigned long answer;
	struct rnd_state *s = &p->prng_state;

	if (!state || state->rho == 0)	/* no correlation */
		return prandom_u32_state(s);

	value = prandom_u32_state(s);
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
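/* Editorial worked example (not part of the module): with rho expressed as a
 * fraction of 2^32, get_crandom() computes a fixed-point blend of the fresh
 * PRNG draw and the previous output:
 *
 *	answer = (value * (2^32 - rho) + last * rho) >> 32
 *
 * e.g. a 25% correlation set from userspace (assumed iproute2 syntax such as
 * "tc ... loss 10% 25%") stores rho ~= 0.25 * 2^32, so each new value is
 * roughly 75% fresh randomness and 25% the previous value, which is what
 * makes the resulting loss/delay decisions bursty rather than independent.
 */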
/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = prandom_u32_state(&q->prng.prng_state);

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_BURST_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		}
		break;
	case LOST_IN_GAP_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}
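/* Illustrative usage (assumed iproute2 syntax, not defined in this file):
 *
 *	tc qdisc add dev eth0 root netem loss state 1% 10% 70% 0.1% 1%
 *
 * The five percentages map to p13 p31 p32 p23 p14 of the 4-state chain,
 * i.e. clg->a1, a2, a3, a5 and a4 above, each scaled by userspace to the
 * u32 probability range carried in struct tc_netem_gimodel.
 */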
/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	struct rnd_state *s = &q->prng.prng_state;

	switch (clg->state) {
	case GOOD_STATE:
		if (prandom_u32_state(s) < clg->a1)
			clg->state = BAD_STATE;
		if (prandom_u32_state(s) < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (prandom_u32_state(s) < clg->a2)
			clg->state = GOOD_STATE;
		if (prandom_u32_state(s) > clg->a3)
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng);

	case CLG_4_STATES:
		/* 4state loss model algorithm (used also for GI model)
		 * Extracts a value from the markov 4 state loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static s64 tabledist(s64 mu, s32 sigma,
		     struct crndstate *state,
		     struct prng *prng,
		     const struct disttable *dist)
{
	s64 x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state, prng);

	/* default uniform distribution */
	if (dist == NULL)
		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
{
	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size) /* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	return div64_u64(len * NSEC_PER_SEC, q->rate);
}
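/* Worked example (editorial, values illustrative): with rate = 125000 bytes/s
 * (1 Mbit/s), packet_overhead = 0 and cell_size = 0, a 1500 byte packet takes
 *
 *	1500 * NSEC_PER_SEC / 125000 = 12,000,000 ns = 12 ms
 *
 * of emulated transmission time. With an assumed ATM-like framing of
 * cell_size = 53 and cell_overhead = 5, the same packet is rounded up to
 * 29 cells and billed as 29 * (53 + 5) = 1682 bytes before the division.
 */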
static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}

	rtnl_kfree_skbs(q->t_head, q->t_tail);
	q->t_head = NULL;
	q->t_tail = NULL;
	q->t_len = 0;
}

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

		while (*p) {
			struct sk_buff *skb;

			parent = *p;
			skb = rb_to_skb(parent);
			if (tnext >= netem_skb_cb(skb)->time_to_send)
				p = &parent->rb_right;
			else
				p = &parent->rb_left;
		}
		rb_link_node(&nskb->rbnode, parent, p);
		rb_insert_color(&nskb->rbnode, &q->t_root);
	}
	q->t_len++;
	sch->q.qlen++;
}

/* netem can't properly corrupt a megapacket (like we get from GSO), so when
 * we statistically choose to corrupt one, we instead segment it, returning
 * the first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}
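/* Editorial example (numbers illustrative): with corruption enabled, a 64 KB
 * GSO skb with a 1448 byte MSS is split by netem_segment() into roughly 45
 * linear segments. Only the first segment gets a random bit flipped in
 * netem_enqueue(); the remaining segments are re-enqueued individually, and
 * the parent qdiscs (which accounted for one skb of the original length) are
 * fixed up afterwards via qdisc_tree_reduce_backlog() with negative deltas.
 */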
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2 = NULL;
	struct sk_buff *segs = NULL;
	unsigned int prev_len = qdisc_pkt_len(skb);
	int count = 1;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then clone it before
	 * original is modified.
	 */
	if (count > 1)
		skb2 = skb_clone(skb, GFP_ATOMIC);

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) {
		if (skb_is_gso(skb)) {
			skb = netem_segment(skb, sch, to_free);
			if (!skb)
				goto finish_segs;

			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
			1<<get_random_u32_below(8);
	}

	if (unlikely(q->t_len >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		if (skb2)
			__qdisc_drop(skb2, to_free);
		return NET_XMIT_DROP;
	}

	/*
	 * If doing duplication then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (skb2) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		skb2 = NULL;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor, &q->prng)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, &q->prng, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (skb2)
		__qdisc_drop(skb2, to_free);

	if (segs) {
		unsigned int len, last_len;
		int rc, nb;

		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}
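/* Illustrative configuration (assumed iproute2 syntax, shown for context):
 *
 *	tc qdisc add dev eth0 root netem delay 10ms reorder 25% 50% \
 *		duplicate 1% corrupt 0.1%
 *
 * maps onto the fields used above: q->reorder/q->reorder_cor (25% with 50%
 * correlation), q->duplicate (1%) and q->corrupt (0.1%), each scaled by
 * userspace to the 0..2^32-1 probability range that get_crandom() output is
 * compared against.
 */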
/* Schedule the start of the next delivery slot and refill its
 * packet and byte budgets.
 */
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
	s64 next_delay;

	if (!q->slot_dist)
		next_delay = q->slot_config.min_delay +
				(get_random_u32() *
				 (q->slot_config.max_delay -
				  q->slot_config.min_delay) >> 32);
	else
		next_delay = tabledist(q->slot_config.dist_delay,
				       (s32)(q->slot_config.dist_jitter),
				       NULL, &q->prng, q->slot_dist);

	q->slot.slot_next = now + next_delay;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
}

static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
	struct sk_buff *skb = skb_rb_first(&q->t_root);
	u64 t1, t2;

	if (!skb)
		return q->t_head;
	if (!q->t_head)
		return skb;

	t1 = netem_skb_cb(skb)->time_to_send;
	t2 = netem_skb_cb(q->t_head)->time_to_send;
	if (t1 < t2)
		return skb;
	return q->t_head;
}

static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
	if (skb == q->t_head) {
		q->t_head = skb->next;
		if (!q->t_head)
			q->t_tail = NULL;
	} else {
		rb_erase(&skb->rbnode, &q->t_root);
	}
}
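/* Illustrative usage (assumed iproute2 syntax): emulating a duty-cycled link
 * such as Wi-Fi airtime or DOCSIS grants, where traffic is released in bursts:
 *
 *	tc qdisc add dev eth0 root netem slot 800us 1ms packets 32 bytes 64k
 *
 * fills slot_config.min_delay/max_delay with 800us/1ms and caps each slot at
 * 32 packets or 64 KB; get_slot_next() then draws the gap to the next slot
 * uniformly in [min_delay, max_delay] because no slot distribution table is
 * loaded.
 */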
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
deliver:
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* is more time remaining? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			q->t_len--;
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS) {
					if (net_xmit_drop_count(err))
						qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1, pkt_len);
					sch->qstats.backlog -= pkt_len;
					sch->q.qlen--;
				}
				goto tfifo_dequeue;
			}
			sch->q.qlen--;
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb) {
				sch->q.qlen--;
				goto deliver;
			}
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb) {
			sch->q.qlen--;
			goto deliver;
		}
	}
	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
{
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	struct disttable *d;
	int i;

	if (!n || n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	*tbl = d;
	return 0;
}
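/* Editorial note with a worked example (values assumed, not from this file):
 * the table entries are samples of the chosen distribution scaled by
 * NETEM_DIST_SCALE (8192); iproute2 ships normal/pareto/paretonormal tables
 * and loads one with e.g.
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 20ms distribution normal
 *
 * tabledist() then returns roughly mu + t * sigma / NETEM_DIST_SCALE, so a
 * table entry t = 8192 contributes one full standard deviation (20ms here)
 * on top of the 100ms mean.
 */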
static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_slot *c = nla_data(attr);

	q->slot_config = *c;
	if (q->slot_config.max_packets == 0)
		q->slot_config.max_packets = INT_MAX;
	if (q->slot_config.max_bytes == 0)
		q->slot_config.max_bytes = INT_MAX;

	/* capping dist_jitter to the range acceptable by tabledist() */
	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));

	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter)
		q->slot.slot_next = ktime_get_ns();
	else
		q->slot.slot_next = 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
	[TCA_NETEM_PRNG_SEED]	= { .type = NLA_U64 },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse_deprecated(tb, maxtype,
					    nla_data(nla) + NLA_ALIGN(len),
					    nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
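/* Editorial sketch of the option layout parse_attr() walks (field names from
 * the uapi headers; the diagram is illustrative, not normative):
 *
 *	TCA_OPTIONS
 *	+------------------------------+  <- nla_data(opt)
 *	| struct tc_netem_qopt         |     legacy fixed header: latency,
 *	| (limit, loss, gap, dup, ...) |     limit, loss, gap, duplicate, jitter
 *	+------------------------------+  <- nla_data(opt) + NLA_ALIGN(len)
 *	| nested attrs: TCA_NETEM_CORR,|
 *	| TCA_NETEM_RATE64, ...        |     parsed against netem_policy[]
 *	+------------------------------+
 *
 * which is why parse_attr() offsets by NLA_ALIGN(sizeof(*qopt)) instead of
 * treating TCA_OPTIONS as an ordinary nested attribute.
 */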
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct disttable *delay_dist = NULL;
	struct disttable *slot_dist = NULL;
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto table_free;
	}

	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto table_free;
	}

	sch_tree_lock(sch);
	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			q->clg = old_clg;
			goto unlock;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (delay_dist)
		swap(q->delay_dist, delay_dist);
	if (slot_dist)
		swap(q->slot_dist, slot_dist);
	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	/* capping jitter to the range acceptable by tabledist() */
	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);

	if (tb[TCA_NETEM_PRNG_SEED])
		q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
	else
		q->prng.seed = get_random_u64();
	prandom_seed_state(&q->prng.prng_state, q->prng.seed);

unlock:
	sch_tree_unlock(sch);

table_free:
	dist_free(delay_dist);
	dist_free(slot_dist);
	return ret;
}

static int netem_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt, extack);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_put(q->qdisc);
	dist_free(q->delay_dist);
	dist_free(q->slot_dist);
}
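/* Illustrative usage (assumed iproute2 syntax): reproducible experiments can
 * pin the loss/delay PRNG via TCA_NETEM_PRNG_SEED, e.g.
 *
 *	tc qdisc add dev eth0 root netem loss 1% seed 42
 *
 * Two runs with the same seed and the same traffic pattern then draw the same
 * prandom_u32_state() sequence in netem_change()'s PRNG; without "seed" the
 * state is seeded from get_random_u64() and every run differs.
 */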
static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;
	struct tc_netem_slot slot;

	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
			     UINT_MAX);
	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
			    UINT_MAX);
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter) {
		slot = q->slot_config;
		if (slot.max_packets == INT_MAX)
			slot.max_packets = 0;
		if (slot.max_bytes == INT_MAX)
			slot.max_bytes = 0;
		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
			goto nla_put_failure;
	}

	if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed,
			      TCA_NETEM_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}
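/* Worked example (editorial): q->rate is in bytes per second, so the 32-bit
 * tc_netem_rate.rate field saturates at 2^32 - 1 B/s, roughly 34.3 Gbit/s.
 * A configuration around 40 Gbit/s (~5e9 B/s) therefore takes the
 * q->rate >= (1ULL << 32) branch above: the real value is reported in
 * TCA_NETEM_RATE64 and the legacy field is pinned to ~0U for old userspace.
 */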
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (!tc_qdisc_stats_dump(sch, 1, walker))
			return;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.find		=	netem_find,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");