// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/* Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	----------------------------------------------------------------

	This started out as a simple way to delay outgoing packets to
	test TCP but has grown to include most of the functionality
	of a full blown network emulator like NISTnet. It can delay
	packets and add random jitter (and correlation). The random
	distribution can be loaded from a table as well to provide
	normal, Pareto, or experimental curves. Packet loss,
	duplication, and reordering can also be emulated.

	This qdisc does not do classification; that can be handled by
	layering other disciplines. It does not need to do bandwidth
	control either, since that can be handled by using token
	bucket or other rate control.

	Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model, a 4-state Markov model.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
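
/* Example usage from user space (illustrative; see tc-netem(8) for the
 * exact syntax supported by your iproute2 version):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem delay 100ms 20ms distribution normal
 *	tc qdisc change dev eth0 root netem loss 0.3% 25% duplicate 1% corrupt 0.1%
 */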

struct disttable {
	u32  size;
	s16  table[];
};

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	s64 latency;
	s64 jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	u64	time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = get_random_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (!state || state->rho == 0)	/* no correlation */
		return get_random_u32();

	value = get_random_u32();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
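
/* Illustration of the fixed-point blend above: with r = rho / 2^32 the
 * update is roughly answer = (1 - r) * value + r * last, an exponentially
 * weighted mix of fresh entropy and the previous output. For example,
 * rho = 0xC0000000 (r ~= 0.75) keeps about 75% of the previous value and
 * mixes in about 25% of the new random draw.
 */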

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = get_random_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and if the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_BURST_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		}
		break;
	case LOST_IN_GAP_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}
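
/* Illustration for loss_4state() above: a1..a5 are transition
 * probabilities scaled to the full u32 range, so a single draw rnd
 * partitions [0, 2^32). In TX_IN_GAP_PERIOD, for instance, rnd < a4 (p14)
 * produces an isolated loss, a4 < rnd < a4 + a1 (p13) starts a loss
 * burst, and anything above that stays in the gap period.
 */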

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case GOOD_STATE:
		if (get_random_u32() < clg->a1)
			clg->state = BAD_STATE;
		if (get_random_u32() < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (get_random_u32() < clg->a2)
			clg->state = GOOD_STATE;
		if (get_random_u32() > clg->a3)
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model):
		 * drop the packet when the Markov 4-state loss generator
		 * returns 1.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * drop the packet when the Gilbert-Elliot loss generator
		 * returns 1.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static s64 tabledist(s64 mu, s32 sigma,
		     struct crndstate *state,
		     const struct disttable *dist)
{
	s64 x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
{
	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	return div64_u64(len * NSEC_PER_SEC, q->rate);
}

static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}

	rtnl_kfree_skbs(q->t_head, q->t_tail);
	q->t_head = NULL;
	q->t_tail = NULL;
}
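
/* The t(ime)fifo below is a hybrid structure: packets whose time_to_send
 * is not earlier than the current tail are appended to the t_head/t_tail
 * linear list (the common case when jitter is low and send times are
 * monotonic), everything else is inserted into the t_root rbtree keyed by
 * time_to_send. netem_peek() later picks the earlier of the two heads.
 */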

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

		while (*p) {
			struct sk_buff *skb;

			parent = *p;
			skb = rb_to_skb(parent);
			if (tnext >= netem_skb_cb(skb)->time_to_send)
				p = &parent->rb_right;
			else
				p = &parent->rb_left;
		}
		rb_link_node(&nskb->rbnode, parent, p);
		rb_insert_color(&nskb->rbnode, &q->t_root);
	}
	sch->q.qlen++;
}

/* netem can't properly corrupt a megapacket (like we get from GSO), so when
 * we statistically choose to corrupt one, we instead segment it, returning
 * the first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}
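
/* netem_enqueue() applies the configured impairments in a fixed order:
 * duplication decision, loss or ECN marking, skb orphaning when a delay
 * will be applied, re-injection of the duplicate at the root qdisc,
 * corruption (GSO packets are segmented first), then delay and rate
 * calculation before the packet is placed into the tfifo, or at the head
 * of the queue when it has been selected for reordering.
 */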

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	struct sk_buff *segs = NULL;
	unsigned int prev_len = qdisc_pkt_len(skb);
	int count = 1;
	int rc = NET_XMIT_SUCCESS;
	int rc_drop = NET_XMIT_DROP;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		rc_drop = NET_XMIT_SUCCESS;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the payload.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (skb_is_gso(skb)) {
			skb = netem_segment(skb, sch, to_free);
			if (!skb)
				return rc_drop;
			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
			1<<get_random_u32_below(8);
	}

	if (unlikely(sch->q.qlen >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		return rc_drop;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (segs) {
		unsigned int len, last_len;
		int nb;

		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}
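
/* Slotting emulates links that deliver packets in bursts rather than in a
 * continuous stream (for example, shared media that aggregate frames).
 * No packet is released before the start of the next slot, and each slot
 * carries at most max_packets packets and max_bytes bytes.
 */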

/* Set up the next slot: pick a future start time for it and refill its
 * packet and byte budgets.
 */
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
	s64 next_delay;

	if (!q->slot_dist)
		next_delay = q->slot_config.min_delay +
				(get_random_u32() *
				 (q->slot_config.max_delay -
				  q->slot_config.min_delay) >> 32);
	else
		next_delay = tabledist(q->slot_config.dist_delay,
				       (s32)(q->slot_config.dist_jitter),
				       NULL, q->slot_dist);

	q->slot.slot_next = now + next_delay;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
}

static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
	struct sk_buff *skb = skb_rb_first(&q->t_root);
	u64 t1, t2;

	if (!skb)
		return q->t_head;
	if (!q->t_head)
		return skb;

	t1 = netem_skb_cb(skb)->time_to_send;
	t2 = netem_skb_cb(q->t_head)->time_to_send;
	if (t1 < t2)
		return skb;
	return q->t_head;
}

static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
	if (skb == q->t_head) {
		q->t_head = skb->next;
		if (!q->t_head)
			q->t_tail = NULL;
	} else {
		rb_erase(&skb->rbnode, &q->t_root);
	}
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);
deliver:
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* is it time to send this packet yet? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			sch->q.qlen--;
			qdisc_qstats_backlog_dec(sch, skb);
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS &&
				    net_xmit_drop_count(err)) {
					qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1,
								  pkt_len);
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}
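
/* The delay and slot distribution tables are supplied by user space;
 * iproute2 ships precomputed tables (e.g. normal, pareto, paretonormal)
 * selected with something like "tc ... netem delay 100ms 20ms distribution
 * normal". tabledist() uses the table to shape the jitter around the
 * configured mean; see tc-netem(8) for details.
 */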

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */

static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
			  const struct nlattr *attr)
{
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;

	if (!n || n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(*tbl, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}

static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_slot *c = nla_data(attr);

	q->slot_config = *c;
	if (q->slot_config.max_packets == 0)
		q->slot_config.max_packets = INT_MAX;
	if (q->slot_config.max_bytes == 0)
		q->slot_config.max_bytes = INT_MAX;

	/* capping dist_jitter to the range acceptable by tabledist() */
	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));

	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter)
		q->slot.slot_next = ktime_get_ns();
	else
		q->slot.slot_next = 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}
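
/* For reference, the GI and GE parameters above are configured from user
 * space with roughly "loss state p13 [p31 [p32 [p23 p14]]]" and
 * "loss gemodel p [r [1-h [1-k]]]"; see tc-netem(8) for the exact syntax.
 */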

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse_deprecated(tb, maxtype,
					    nla_data(nla) + NLA_ALIGN(len),
					    nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}
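
/* Unlike a plain nested attribute, netem's TCA_OPTIONS starts with a fixed
 * struct tc_netem_qopt (the legacy interface), optionally followed by
 * nested attributes; parse_attr() above skips the aligned struct before
 * running the remainder through netem_policy.
 */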

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			return ret;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, &q->delay_dist,
				     tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto get_table_failure;
	}

	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(sch, &q->slot_dist,
				     tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto get_table_failure;
	}

	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	/* capping jitter to the range acceptable by tabledist() */
	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);

	return ret;

get_table_failure:
	/* restore clg and loss_model in case q->clg and q->loss_model
	 * were modified in get_loss_clg()
	 */
	q->clg = old_clg;
	q->loss_model = old_loss_model;
	return ret;
}

static int netem_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt, extack);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_put(q->qdisc);
	dist_free(q->delay_dist);
	dist_free(q->slot_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
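
/* netem_dump() reports the configuration in both forms: the legacy 32-bit
 * psched-tick fields in struct tc_netem_qopt (clamped to UINT_MAX) and the
 * nanosecond TCA_NETEM_LATENCY64/TCA_NETEM_JITTER64 attributes, so both
 * old and new tc binaries can read it back.
 */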

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;
	struct tc_netem_slot slot;

	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
			     UINT_MAX);
	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
			    UINT_MAX);
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter) {
		slot = q->slot_config;
		if (slot.max_packets == INT_MAX)
			slot.max_packets = 0;
		if (slot.max_bytes == INT_MAX)
			slot.max_bytes = 0;
		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
			goto nla_put_failure;
	}

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (!tc_qdisc_stats_dump(sch, 1, walker))
			return;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.find		=	netem_find,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};
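
/* netem is minimally classful: the ops above expose a single class (id 1)
 * so that another qdisc, e.g. tbf for bandwidth control, can be grafted
 * underneath, as suggested in the header comment.
 */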

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};

static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");