// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can be loaded from a table as well to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines.  It does not need to do bandwidth
	 control either, since that can be handled by using a token
	 bucket or other rate control.

     Correlated Loss Generator models

	Added generation of correlated loss according to the
	"Gilbert-Elliot" model (a 2-state Markov chain) and a more
	general 4-state Markov model (the GI model).

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
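
/* Editor's note: example command lines (illustration only, not exhaustive;
 * the examples are independent of each other and see tc-netem(8) for the
 * authoritative syntax):
 *
 *	# 100ms delay, +/- 10ms jitter, 25% correlation between samples
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *	# same instance, now also with 0.3% random loss and 1% duplication
 *	# (netem re-parses all options on change, so repeat the delay)
 *	tc qdisc change dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.3% duplicate 1%
 *
 *	# strict bandwidth policing is left to other qdiscs; e.g. a TBF
 *	# can be attached below netem's single class 1:1
 *	tc qdisc add dev eth0 root handle 1: netem delay 50ms
 *	tc qdisc add dev eth0 parent 1:1 handle 10: tbf rate 1mbit \
 *		burst 32kbit latency 400ms
 */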
struct disttable {
	u32 size;
	s16 table[];
};

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff *t_head;
	struct sk_buff *t_tail;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc *qdisc;

	struct qdisc_watchdog watchdog;

	s64 latency;
	s64 jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5;	/* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	u64 time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = get_random_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (!state || state->rho == 0)	/* no correlation */
		return get_random_u32();

	value = get_random_u32();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
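
/* Editor's note: the update above is a fixed-point weighted average,
 * roughly answer = (1 - p) * fresh + p * last with p = rho / 2^32, so
 * rho == 0 yields independent values while rho close to ~0U makes each
 * value track the previous one.  A standalone restatement of the same
 * math (illustration only, excluded from the build; the helper name is
 * made up):
 */
#if 0
static u32 crandom_next(u32 last, u32 rho_u32, u32 fresh)
{
	u64 rho = (u64)rho_u32 + 1;	/* mirrors get_crandom() */

	return (u32)(((u64)fresh * ((1ULL << 32) - rho) +
		      (u64)last * rho) >> 32);
}
#endif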
/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = get_random_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and whether the next packet will be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_GAP_PERIOD => isolated losses within a gap period
	 *   LOST_IN_BURST_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_BURST_PERIOD:
		if (rnd < clg->a3) {
			clg->state = TX_IN_BURST_PERIOD;
		} else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		}
		break;
	case LOST_IN_GAP_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}
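
/* Editor's note: rnd is uniform over [0, 2^32), so the a1..a5 thresholds
 * above (and netem's other probability fields such as loss, reorder,
 * corrupt and duplicate) are fixed-point fractions of 2^32.  One way to
 * convert between percentages and these thresholds (illustration only,
 * excluded from the build; iproute2 performs its own equivalent scaling):
 */
#if 0
static u32 percent_to_u32_prob(double percent)
{
	return (u32)(percent / 100.0 * 4294967295.0 + 0.5);
}

static double u32_prob_to_percent(u32 prob)
{
	return (double)prob / 4294967295.0 * 100.0;
}
#endif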
/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case GOOD_STATE:
		if (get_random_u32() < clg->a1)
			clg->state = BAD_STATE;
		if (get_random_u32() < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (get_random_u32() < clg->a2)
			clg->state = GOOD_STATE;
		if (get_random_u32() > clg->a3)
			return true;
	}

	return false;
}
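
/* Editor's note: for the chain above, the long-run probability of sitting
 * in BAD_STATE is a1 / (a1 + a2), so the overall loss rate works out to
 *
 *	P(loss) = (a2 * a4 + a1 * (2^32 - a3)) / ((a1 + a2) * 2^32)
 *
 * with a1..a4 read as fractions of 2^32, exactly as loss_gilb_ell() reads
 * them.  A floating-point sketch (illustration only, excluded from the
 * build; the helper name is made up):
 */
#if 0
static double gilb_ell_loss_rate(u32 a1, u32 a2, u32 a3, u32 a4)
{
	double p  = a1 / 4294967296.0;		/* GOOD -> BAD transition  */
	double r  = a2 / 4294967296.0;		/* BAD  -> GOOD transition */
	double lb = 1.0 - a3 / 4294967296.0;	/* loss probability in BAD  */
	double lg = a4 / 4294967296.0;		/* loss probability in GOOD */

	if (p + r == 0.0)
		return lg;	/* chain never leaves its initial GOOD_STATE */

	return (r * lg + p * lb) / (p + r);
}
#endif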
static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4-state loss model algorithm (used also for the GI model):
		 * a value is extracted from the 4-state Markov loss generator;
		 * a true return means this packet is dropped.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm:
		 * a value is extracted from the Gilbert-Elliot loss generator;
		 * a true return means this packet is dropped.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static s64 tabledist(s64 mu, s32 sigma,
		     struct crndstate *state,
		     const struct disttable *dist)
{
	s64 x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
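
/* Editor's note: with a table loaded, the value returned above is
 * approximately mu + sigma * t / NETEM_DIST_SCALE, where t is a signed
 * 16-bit sample drawn (with correlation) from the table; splitting sigma
 * into quotient and remainder merely avoids 64-bit overflow and rounds
 * the remainder part to nearest.  For example, mu = 100 ms, sigma = 10 ms
 * and a table entry t = 2 * NETEM_DIST_SCALE (a "+2 sigma" sample) give
 * roughly 120 ms.  A direct restatement (illustration only, excluded from
 * the build; assumes sigma * t fits in 64 bits):
 */
#if 0
static s64 tabledist_approx(s64 mu, s32 sigma, s16 t)
{
	return mu + div_s64((s64)sigma * t, NETEM_DIST_SCALE);
}
#endif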
static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
{
	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	return div64_u64(len * NSEC_PER_SEC, q->rate);
}
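
/* Editor's note: q->rate is a byte rate (bytes per second as supplied by
 * userspace), so the function above yields the serialization time of one
 * packet after padding it out to whole link-layer cells.  For example,
 * with packet_overhead = 0, cell_size = 48 and cell_overhead = 5
 * (ATM-like framing), a 1500-byte packet occupies 32 cells = 32 * 53 =
 * 1696 bytes on the wire; at rate = 125000 bytes/s (1 Mbit/s) that is
 * 1696 * 10^9 / 125000 ns, i.e. about 13.57 ms.
 */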
static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}

	rtnl_kfree_skbs(q->t_head, q->t_tail);
	q->t_head = NULL;
	q->t_tail = NULL;
}

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

		while (*p) {
			struct sk_buff *skb;

			parent = *p;
			skb = rb_to_skb(parent);
			if (tnext >= netem_skb_cb(skb)->time_to_send)
				p = &parent->rb_right;
			else
				p = &parent->rb_left;
		}
		rb_link_node(&nskb->rbnode, parent, p);
		rb_insert_color(&nskb->rbnode, &q->t_root);
	}
	sch->q.qlen++;
}

/* netem can't properly corrupt a megapacket (like we get from GSO), so when we
 * statistically choose to corrupt one, we instead segment it, returning the
 * first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}
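
/* Editor's note: corruption only ever flips a bit inside the linear head
 * of the skb, which for a still-aggregated GSO superpacket is mostly
 * headers.  That is why netem_enqueue() software-segments the GSO skb
 * first, corrupts the first resulting frame, and feeds the remaining
 * segments back through qdisc_enqueue() (see finish_segs below).
 */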
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	struct sk_buff *segs = NULL;
	unsigned int prev_len = qdisc_pkt_len(skb);
	int count = 1;
	int rc = NET_XMIT_SUCCESS;
	int rc_drop = NET_XMIT_DROP;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root_bh(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
		rc_drop = NET_XMIT_SUCCESS;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying.
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (skb_is_gso(skb)) {
			skb = netem_segment(skb, sch, to_free);
			if (!skb)
				return rc_drop;
			segs = skb->next;
			skb_mark_not_on_list(skb);
			qdisc_skb_cb(skb)->pkt_len = skb->len;
		}

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			skb = NULL;
			goto finish_segs;
		}

		skb->data[get_random_u32_below(skb_headlen(skb))] ^=
			1<<get_random_u32_below(8);
	}

	if (unlikely(sch->q.qlen >= sch->limit)) {
		/* re-link segs, so that qdisc_drop_all() frees them all */
		skb->next = segs;
		qdisc_drop_all(skb, sch, to_free);
		return rc_drop;
	}

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (segs) {
		unsigned int len, last_len;
		int nb;

		len = skb ? skb->len : 0;
		nb = skb ? 1 : 0;

		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		/* Parent qdiscs accounted for 1 skb of size @prev_len */
		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
	} else if (!skb) {
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;
}
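
/* Editor's note: when rate emulation is active and some packet is already
 * queued, the effect of the "reference point" arithmetic above is
 *
 *	time_to_send = max(prev_time_to_send, now + random_delay)
 *		       + packet_time_ns(len, q)
 *
 * i.e. a packet never departs before the one queued ahead of it, and its
 * own serialization time is added on top (with an empty queue the base is
 * simply now + random_delay).  A sketch of that formula (illustration
 * only, excluded from the build; the helper name is made up):
 */
#if 0
static s64 next_departure(s64 now, s64 prev_time_to_send,
			  s64 random_delay, s64 pkt_time)
{
	return max_t(s64, prev_time_to_send, now + random_delay) + pkt_time;
}
#endif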
/* Delay the next round by scheduling a new future slot with a fresh
 * byte and packet budget.
 */
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
	s64 next_delay;

	if (!q->slot_dist)
		next_delay = q->slot_config.min_delay +
				(get_random_u32() *
				 (q->slot_config.max_delay -
					q->slot_config.min_delay) >> 32);
	else
		next_delay = tabledist(q->slot_config.dist_delay,
				       (s32)(q->slot_config.dist_jitter),
				       NULL, q->slot_dist);

	q->slot.slot_next = now + next_delay;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
}

static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
	struct sk_buff *skb = skb_rb_first(&q->t_root);
	u64 t1, t2;

	if (!skb)
		return q->t_head;
	if (!q->t_head)
		return skb;

	t1 = netem_skb_cb(skb)->time_to_send;
	t2 = netem_skb_cb(q->t_head)->time_to_send;
	if (t1 < t2)
		return skb;
	return q->t_head;
}

static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
	if (skb == q->t_head) {
		q->t_head = skb->next;
		if (!q->t_head)
			q->t_tail = NULL;
	} else {
		rb_erase(&skb->rbnode, &q->t_root);
	}
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);
deliver:
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* is it time to send this packet yet? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			sch->q.qlen--;
			qdisc_qstats_backlog_dec(sch, skb);
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS &&
				    net_xmit_drop_count(err)) {
					qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1,
								  pkt_len);
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}
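
/* Editor's note: slots model links that deliver in bursts (e.g. Wi-Fi or
 * DOCSIS aggregation).  A packet whose time_to_send has passed is still
 * held back until the current slot opens, and each slot carries a packet
 * and byte budget.  Without a slot distribution table the gap to the next
 * slot is drawn uniformly from [min_delay, max_delay]; a restatement of
 * that draw (illustration only, excluded from the build):
 */
#if 0
static s64 uniform_slot_gap(s64 min_delay, s64 max_delay, u32 rnd)
{
	return min_delay + (s64)((u64)rnd * (u64)(max_delay - min_delay) >> 32);
}
#endif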
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */

static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
			  const struct nlattr *attr)
{
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;

	if (!n || n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(struct_size(d, table, n), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(*tbl, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}
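
/* Editor's note: the payload is an inverse-CDF (quantile) table of a
 * zero-mean distribution scaled so that NETEM_DIST_SCALE corresponds to
 * one "sigma" as used by tabledist(); iproute2's bundled tables (normal,
 * pareto, paretonormal, experimental) are produced along these lines from
 * sampled data.  A user-space-style sketch that builds such a table for a
 * uniform distribution with unit variance (illustration only, excluded
 * from the build; the helper name is made up):
 */
#if 0
static void build_uniform_dist(s16 *table, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		double u = (i + 0.5) / n;		/* quantile in (0, 1) */
		double x = 1.7320508 * (2.0 * u - 1.0);	/* +/- sqrt(3) sigma */

		table[i] = (s16)(x * NETEM_DIST_SCALE);
	}
}
#endif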
static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_slot *c = nla_data(attr);

	q->slot_config = *c;
	if (q->slot_config.max_packets == 0)
		q->slot_config.max_packets = INT_MAX;
	if (q->slot_config.max_bytes == 0)
		q->slot_config.max_bytes = INT_MAX;

	/* capping dist_jitter to the range acceptable by tabledist() */
	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));

	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter)
		q->slot.slot_next = ktime_get_ns();
	else
		q->slot.slot_next = 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse_deprecated(tb, maxtype,
					    nla_data(nla) + NLA_ALIGN(len),
					    nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	sch_tree_lock(sch);
	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			goto unlock;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, &q->delay_dist,
				     tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto get_table_failure;
	}

	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(sch, &q->slot_dist,
				     tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto get_table_failure;
	}

	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions:
	 * if gap is set, assume 100% reorder probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	/* capping jitter to the range acceptable by tabledist() */
	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);

unlock:
	sch_tree_unlock(sch);
	return ret;

get_table_failure:
	/* recover clg and loss_model, in case q->clg and q->loss_model
	 * were modified in get_loss_clg()
	 */
	q->clg = old_clg;
	q->loss_model = old_loss_model;

	goto unlock;
}
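
/* Editor's note on the option layout parsed above: TCA_OPTIONS starts with
 * the legacy fixed-size struct tc_netem_qopt (latency and jitter in psched
 * ticks, plus limit, loss, gap, duplicate), and parse_attr() then parses
 * the nested attributes that follow it.  The 64-bit attributes refine the
 * legacy fields: TCA_NETEM_LATENCY64/JITTER64 replace the tick values
 * outright, while TCA_NETEM_RATE64 is only used when it is larger than the
 * 32-bit rate, so older userspace keeps working.
 */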
static int netem_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt, extack);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_put(q->qdisc);
	dist_free(q->delay_dist);
	dist_free(q->slot_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;
	struct tc_netem_slot slot;

	qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
			     UINT_MAX);
	qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
			    UINT_MAX);
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter) {
		slot = q->slot_config;
		if (slot.max_packets == INT_MAX)
			slot.max_packets = 0;
		if (slot.max_bytes == INT_MAX)
			slot.max_bytes = 0;
		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
			goto nla_put_failure;
	}

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (!tc_qdisc_stats_dump(sch, 1, walker))
			return;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		= netem_graft,
	.leaf		= netem_leaf,
	.find		= netem_find,
	.walk		= netem_walk,
	.dump		= netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		= "netem",
	.cl_ops		= &netem_class_ops,
	.priv_size	= sizeof(struct netem_sched_data),
	.enqueue	= netem_enqueue,
	.dequeue	= netem_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= netem_init,
	.reset		= netem_reset,
	.destroy	= netem_destroy,
	.change		= netem_change,
	.dump		= netem_dump,
	.owner		= THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");