/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can be loaded from a table as well to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines. It does not need to do bandwidth
	 control either, since that can be handled by using token
	 bucket or other rate control.

	 Correlated Loss Generator models

	 Added generation of correlated loss according to the
	 "Gilbert-Elliot" model, a 4-state Markov model.

	 References:
	 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	 and intuitive loss model for packet networks and its implementation
	 in the Netem module in the Linux kernel", available in [1]

	 Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		  Fabio Ludovici <fabio.ludovici at yahoo.it>
*/

struct disttable {
	u32  size;
	s16 table[0];
};

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	s64 latency;
	s64 jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable *delay_dist;

	enum  {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};

/* Time stamp put into socket buffer control block.
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	u64	        time_to_send;
};

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = prandom_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
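 *
 * Roughly, with R = rho + 1 the next value is
 *
 *	next = (U * (2^32 - R) + last * R) >> 32
 *
 * i.e. a fixed-point blend of a fresh uniform value U and the previous
 * output: rho == 0 gives independent uniform values, while rho close to
 * 2^32 - 1 makes consecutive values nearly identical.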
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (!state || state->rho == 0)	/* no correlation */
		return prandom_u32();

	value = prandom_u32();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = prandom_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and whether the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_GAP_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		}
		break;
	case LOST_IN_BURST_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
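 *
 * Note: a1..a4 are compared directly against prandom_u32(), so userspace
 * is expected to scale each probability to the full 32-bit range
 * (a probability p encoded as roughly p * UINT_MAX).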
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case GOOD_STATE:
		if (prandom_u32() < clg->a1)
			clg->state = BAD_STATE;
		if (prandom_u32() < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (prandom_u32() < clg->a2)
			clg->state = GOOD_STATE;
		if (prandom_u32() > clg->a3)
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4state loss model algorithm (used also for GI model)
		 * Extracts a value from the Markov 4-state loss generator;
		 * the packet is dropped when the generator returns true.
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator;
		 * the packet is dropped when the generator returns true.
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static s64 tabledist(s64 mu, s32 sigma,
		     struct crndstate *state,
		     const struct disttable *dist)
{
	s64 x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return ((rnd % (2 * sigma)) + mu) - sigma;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
{
	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	return div64_u64(len * NSEC_PER_SEC, q->rate);
}

static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p = rb_first(&q->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &q->t_root);
		rtnl_kfree_skbs(skb, skb);
	}

	rtnl_kfree_skbs(q->t_head, q->t_tail);
	q->t_head = NULL;
	q->t_tail = NULL;
}

static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	u64 tnext = netem_skb_cb(nskb)->time_to_send;

	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
		if (q->t_tail)
			q->t_tail->next = nskb;
		else
			q->t_head = nskb;
		q->t_tail = nskb;
	} else {
		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

		while (*p) {
			struct sk_buff *skb;

			parent = *p;
			skb = rb_to_skb(parent);
			if (tnext >= netem_skb_cb(skb)->time_to_send)
				p = &parent->rb_right;
			else
				p = &parent->rb_left;
		}
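		/* The walk found the insertion slot: link the new skb and
		 * rebalance. Equal send times went to the right above, so
		 * FIFO order is kept for packets with the same timestamp.
		 */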
		rb_link_node(&nskb->rbnode, parent, p);
		rb_insert_color(&nskb->rbnode, &q->t_root);
	}
	sch->q.qlen++;
}

/* netem can't properly corrupt a megapacket (like we get from GSO), so when
 * we statistically choose to corrupt one, we instead segment it, returning
 * the first packet to be corrupted, and re-enqueue the remaining frames
 */
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
				     struct sk_buff **to_free)
{
	struct sk_buff *segs;
	netdev_features_t features = netif_skb_features(skb);

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs)) {
		qdisc_drop(skb, sch, to_free);
		return NULL;
	}
	consume_skb(skb);
	return segs;
}

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	struct sk_buff *segs = NULL;
	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
	int nb = 0;
	int count = 1;
	int rc = NET_XMIT_SUCCESS;

	/* Do not fool qdisc_drop_all() */
	skb->prev = NULL;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter || q->rate)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since the parent qdisc expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */

		q->duplicate = 0;
		rootq->enqueue(skb2, rootq, to_free);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed, since we are going to modify the data.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
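	 *
	 * Corruption flips a single randomly chosen bit in the linear part
	 * of the packet (skb->data[0 .. skb_headlen(skb) - 1]); GSO packets
	 * are segmented first so only one real-sized segment is mangled.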
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (skb_is_gso(skb)) {
			segs = netem_segment(skb, sch, to_free);
			if (!segs)
				return NET_XMIT_DROP;
		} else {
			segs = skb;
		}

		skb = segs;
		segs = segs->next;

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			qdisc_qstats_drop(sch);
			goto finish_segs;
		}
		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			qdisc_drop(skb, sch, to_free);
			goto finish_segs;
		}

		skb->data[prandom_u32() % skb_headlen(skb)] ^=
			1<<(prandom_u32() % 8);
	}

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop_all(skb, sch, to_free);

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		u64 now;
		s64 delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = ktime_get_ns();

		if (q->rate) {
			struct netem_skb_cb *last = NULL;

			if (sch->q.tail)
				last = netem_skb_cb(sch->q.tail);
			if (q->t_root.rb_node) {
				struct sk_buff *t_skb;
				struct netem_skb_cb *t_last;

				t_skb = skb_rb_last(&q->t_root);
				t_last = netem_skb_cb(t_skb);
				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}
			if (q->t_tail) {
				struct netem_skb_cb *t_last =
					netem_skb_cb(q->t_tail);

				if (!last ||
				    t_last->time_to_send > last->time_to_send)
					last = t_last;
			}

			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= last->time_to_send - now;
				delay = max_t(s64, 0, delay);
				now = last->time_to_send;
			}

			delay += packet_time_ns(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = ktime_get_ns();
		q->counter = 0;

		__qdisc_enqueue_head(skb, &sch->q);
		sch->qstats.requeues++;
	}

finish_segs:
	if (segs) {
		while (segs) {
			skb2 = segs->next;
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			last_len = segs->len;
			rc = qdisc_enqueue(segs, sch, to_free);
			if (rc != NET_XMIT_SUCCESS) {
				if (net_xmit_drop_count(rc))
					qdisc_qstats_drop(sch);
			} else {
				nb++;
				len += last_len;
			}
			segs = skb2;
		}
		sch->q.qlen += nb;
		if (nb > 1)
			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	}
	return NET_XMIT_SUCCESS;
}

/* Delay the next round with a new future slot and reset the slot's
 * packet and byte budgets.
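 *
 * Without a slot distribution table the next slot start is drawn
 * uniformly from [min_delay, max_delay) via a 32-bit fixed-point
 * multiply (e.g. prandom_u32() == 0x80000000 lands near the midpoint);
 * with a table it comes from tabledist() around dist_delay with spread
 * dist_jitter.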
 */

static void get_slot_next(struct netem_sched_data *q, u64 now)
{
	s64 next_delay;

	if (!q->slot_dist)
		next_delay = q->slot_config.min_delay +
				(prandom_u32() *
				 (q->slot_config.max_delay -
				  q->slot_config.min_delay) >> 32);
	else
		next_delay = tabledist(q->slot_config.dist_delay,
				       (s32)(q->slot_config.dist_jitter),
				       NULL, q->slot_dist);

	q->slot.slot_next = now + next_delay;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
}

static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
	struct sk_buff *skb = skb_rb_first(&q->t_root);
	u64 t1, t2;

	if (!skb)
		return q->t_head;
	if (!q->t_head)
		return skb;

	t1 = netem_skb_cb(skb)->time_to_send;
	t2 = netem_skb_cb(q->t_head)->time_to_send;
	if (t1 < t2)
		return skb;
	return q->t_head;
}

static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
	if (skb == q->t_head) {
		q->t_head = skb->next;
		if (!q->t_head)
			q->t_tail = NULL;
	} else {
		rb_erase(&skb->rbnode, &q->t_root);
	}
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

tfifo_dequeue:
	skb = __qdisc_dequeue_head(&sch->q);
	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);
deliver:
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	skb = netem_peek(q);
	if (skb) {
		u64 time_to_send;
		u64 now = ktime_get_ns();

		/* is it time to send this packet yet? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
			get_slot_next(q, now);

		if (time_to_send <= now && q->slot.slot_next <= now) {
			netem_erase_head(q, skb);
			sch->q.qlen--;
			qdisc_qstats_backlog_dec(sch, skb);
			skb->next = NULL;
			skb->prev = NULL;
			/* skb->dev shares skb->rbnode area,
			 * we need to restore its value.
			 */
			skb->dev = qdisc_dev(sch);

			if (q->slot.slot_next) {
				q->slot.packets_left--;
				q->slot.bytes_left -= qdisc_pkt_len(skb);
				if (q->slot.packets_left <= 0 ||
				    q->slot.bytes_left <= 0)
					get_slot_next(q, now);
			}

			if (q->qdisc) {
				unsigned int pkt_len = qdisc_pkt_len(skb);
				struct sk_buff *to_free = NULL;
				int err;

				err = qdisc_enqueue(skb, q->qdisc, &to_free);
				kfree_skb_list(to_free);
				if (err != NET_XMIT_SUCCESS &&
				    net_xmit_drop_count(err)) {
					qdisc_qstats_drop(sch);
					qdisc_tree_reduce_backlog(sch, 1,
								  pkt_len);
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   max(time_to_send,
					       q->slot.slot_next));
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
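 *
 * Each entry is a sample of the desired distribution, scaled by
 * NETEM_DIST_SCALE; tabledist() picks a random entry and multiplies it
 * by sigma to turn it into a concrete delay offset. The tables themselves
 * are typically generated by the iproute2 netem tools and loaded with
 * tc's "distribution" option.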
 */

static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
			  const struct nlattr *attr)
{
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(*tbl, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}

static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_slot *c = nla_data(attr);

	q->slot_config = *c;
	if (q->slot_config.max_packets == 0)
		q->slot_config.max_packets = INT_MAX;
	if (q->slot_config.max_bytes == 0)
		q->slot_config.max_bytes = INT_MAX;
	q->slot.packets_left = q->slot_config.max_packets;
	q->slot.bytes_left = q->slot_config.max_bytes;
	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter)
		q->slot.slot_next = ktime_get_ns();
	else
		q->slot.slot_next = 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
	[TCA_NETEM_LATENCY64]	= { .type = NLA_S64 },
	[TCA_NETEM_JITTER64]	= { .type = NLA_S64 },
	[TCA_NETEM_SLOT]	= { .len = sizeof(struct tc_netem_slot) },
};

static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy, NULL);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			return ret;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, &q->delay_dist,
				     tb[TCA_NETEM_DELAY_DIST]);
		if (ret)
			goto get_table_failure;
	}

	if (tb[TCA_NETEM_SLOT_DIST]) {
		ret = get_dist_table(sch, &q->slot_dist,
				     tb[TCA_NETEM_SLOT_DIST]);
		if (ret)
			goto get_table_failure;
	}

	sch->limit = qopt->limit;

	q->latency = PSCHED_TICKS2NS(qopt->latency);
	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_LATENCY64])
		q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);

	if (tb[TCA_NETEM_JITTER64])
		q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	if (tb[TCA_NETEM_SLOT])
		get_slot(q, tb[TCA_NETEM_SLOT]);

	return ret;

get_table_failure:
	/* recover clg and loss_model, in case
	 * q->clg and q->loss_model were modified
	 * in get_loss_clg()
	 */
	q->clg = old_clg;
	q->loss_model = old_loss_model;
	return ret;
}

static int netem_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	qdisc_watchdog_init(&q->watchdog, sch);

	if (!opt)
		return -EINVAL;

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt, extack);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_put(q->qdisc);
	dist_free(q->delay_dist);
	dist_free(q->slot_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;
	struct tc_netem_slot slot;

	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
			     UINT_MAX);
	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
			    UINT_MAX);
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
		goto nla_put_failure;

	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				      TCA_NETEM_PAD))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	if (q->slot_config.min_delay | q->slot_config.max_delay |
	    q->slot_config.dist_jitter) {
		slot = q->slot_config;
		if (slot.max_packets == INT_MAX)
			slot.max_packets = 0;
		if (slot.max_bytes == INT_MAX)
			slot.max_bytes = 0;
		if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
			goto nla_put_failure;
	}

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.find		=	netem_find,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");
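
/*
 * Example usage from userspace (illustrative only; see tc-netem(8) for the
 * authoritative syntax):
 *
 *	# 100ms delay with 10ms jitter (25% correlated), 0.5% random loss,
 *	# shaped to 1 Mbit/s:
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25% \
 *		loss 0.5% rate 1mbit
 *
 *	# Gilbert-Elliot correlated loss (handled by loss_gilb_ell() above):
 *	tc qdisc change dev eth0 root netem loss gemodel 1% 10% 70% 0.1%
 */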