1 /* 2 * net/sched/sch_netem.c Network emulator 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License. 8 * 9 * Many of the algorithms and ideas for this came from 10 * NIST Net which is not copyrighted. 11 * 12 * Authors: Stephen Hemminger <shemminger@osdl.org> 13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> 14 */ 15 16 #include <linux/mm.h> 17 #include <linux/module.h> 18 #include <linux/slab.h> 19 #include <linux/types.h> 20 #include <linux/kernel.h> 21 #include <linux/errno.h> 22 #include <linux/skbuff.h> 23 #include <linux/vmalloc.h> 24 #include <linux/rtnetlink.h> 25 #include <linux/reciprocal_div.h> 26 #include <linux/rbtree.h> 27 28 #include <net/netlink.h> 29 #include <net/pkt_sched.h> 30 #include <net/inet_ecn.h> 31 32 #define VERSION "1.3" 33 34 /* Network Emulation Queuing algorithm. 35 ==================================== 36 37 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based 38 Network Emulation Tool 39 [2] Luigi Rizzo, DummyNet for FreeBSD 40 41 ---------------------------------------------------------------- 42 43 This started out as a simple way to delay outgoing packets to 44 test TCP but has grown to include most of the functionality 45 of a full blown network emulator like NISTnet. It can delay 46 packets and add random jitter (and correlation). The random 47 distribution can be loaded from a table as well to provide 48 normal, Pareto, or experimental curves. Packet loss, 49 duplication, and reordering can also be emulated. 50 51 This qdisc does not do classification that can be handled in 52 layering other disciplines. It does not need to do bandwidth 53 control either since that can be handled by using token 54 bucket or other rate control. 55 56 Correlated Loss Generator models 57 58 Added generation of correlated loss according to the 59 "Gilbert-Elliot" model, a 4-state markov model. 60 61 References: 62 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG 63 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general 64 and intuitive loss model for packet networks and its implementation 65 in the Netem module in the Linux kernel", available in [1] 66 67 Authors: Stefano Salsano <stefano.salsano at uniroma2.it 68 Fabio Ludovici <fabio.ludovici at yahoo.it> 69 */ 70 71 struct netem_sched_data { 72 /* internal t(ime)fifo qdisc uses t_root and sch->limit */ 73 struct rb_root t_root; 74 75 /* optional qdisc for classful handling (NULL at netem init) */ 76 struct Qdisc *qdisc; 77 78 struct qdisc_watchdog watchdog; 79 80 psched_tdiff_t latency; 81 psched_tdiff_t jitter; 82 83 u32 loss; 84 u32 ecn; 85 u32 limit; 86 u32 counter; 87 u32 gap; 88 u32 duplicate; 89 u32 reorder; 90 u32 corrupt; 91 u64 rate; 92 s32 packet_overhead; 93 u32 cell_size; 94 struct reciprocal_value cell_size_reciprocal; 95 s32 cell_overhead; 96 97 struct crndstate { 98 u32 last; 99 u32 rho; 100 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; 101 102 struct disttable { 103 u32 size; 104 s16 table[0]; 105 } *delay_dist; 106 107 enum { 108 CLG_RANDOM, 109 CLG_4_STATES, 110 CLG_GILB_ELL, 111 } loss_model; 112 113 enum { 114 TX_IN_GAP_PERIOD = 1, 115 TX_IN_BURST_PERIOD, 116 LOST_IN_GAP_PERIOD, 117 LOST_IN_BURST_PERIOD, 118 } _4_state_model; 119 120 enum { 121 GOOD_STATE = 1, 122 BAD_STATE, 123 } GE_state_model; 124 125 /* Correlated Loss Generation models */ 126 struct clgstate { 127 /* state of the Markov chain */ 128 u8 state; 129 130 /* 4-states and Gilbert-Elliot models */ 131 u32 a1; /* p13 for 4-states or p for GE */ 132 u32 a2; /* p31 for 4-states or r for GE */ 133 u32 a3; /* p32 for 4-states or h for GE */ 134 u32 a4; /* p14 for 4-states or 1-k for GE */ 135 u32 a5; /* p23 used only in 4-states */ 136 } clg; 137 138 }; 139 140 /* Time stamp put into socket buffer control block 141 * Only valid when skbs are in our internal t(ime)fifo queue. 142 * 143 * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp, 144 * and skb->next & skb->prev are scratch space for a qdisc, 145 * we save skb->tstamp value in skb->cb[] before destroying it. 146 */ 147 struct netem_skb_cb { 148 psched_time_t time_to_send; 149 }; 150 151 152 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) 153 { 154 return rb_entry(rb, struct sk_buff, rbnode); 155 } 156 157 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) 158 { 159 /* we assume we can use skb next/prev/tstamp as storage for rb_node */ 160 qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); 161 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; 162 } 163 164 /* init_crandom - initialize correlated random number generator 165 * Use entropy source for initial seed. 166 */ 167 static void init_crandom(struct crndstate *state, unsigned long rho) 168 { 169 state->rho = rho; 170 state->last = prandom_u32(); 171 } 172 173 /* get_crandom - correlated random number generator 174 * Next number depends on last value. 175 * rho is scaled to avoid floating point. 176 */ 177 static u32 get_crandom(struct crndstate *state) 178 { 179 u64 value, rho; 180 unsigned long answer; 181 182 if (state->rho == 0) /* no correlation */ 183 return prandom_u32(); 184 185 value = prandom_u32(); 186 rho = (u64)state->rho + 1; 187 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; 188 state->last = answer; 189 return answer; 190 } 191 192 /* loss_4state - 4-state model loss generator 193 * Generates losses according to the 4-state Markov chain adopted in 194 * the GI (General and Intuitive) loss model. 195 */ 196 static bool loss_4state(struct netem_sched_data *q) 197 { 198 struct clgstate *clg = &q->clg; 199 u32 rnd = prandom_u32(); 200 201 /* 202 * Makes a comparison between rnd and the transition 203 * probabilities outgoing from the current state, then decides the 204 * next state and if the next packet has to be transmitted or lost. 205 * The four states correspond to: 206 * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period 207 * LOST_IN_BURST_PERIOD => isolated losses within a gap period 208 * LOST_IN_GAP_PERIOD => lost packets within a burst period 209 * TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period 210 */ 211 switch (clg->state) { 212 case TX_IN_GAP_PERIOD: 213 if (rnd < clg->a4) { 214 clg->state = LOST_IN_BURST_PERIOD; 215 return true; 216 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { 217 clg->state = LOST_IN_GAP_PERIOD; 218 return true; 219 } else if (clg->a1 + clg->a4 < rnd) { 220 clg->state = TX_IN_GAP_PERIOD; 221 } 222 223 break; 224 case TX_IN_BURST_PERIOD: 225 if (rnd < clg->a5) { 226 clg->state = LOST_IN_GAP_PERIOD; 227 return true; 228 } else { 229 clg->state = TX_IN_BURST_PERIOD; 230 } 231 232 break; 233 case LOST_IN_GAP_PERIOD: 234 if (rnd < clg->a3) 235 clg->state = TX_IN_BURST_PERIOD; 236 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { 237 clg->state = TX_IN_GAP_PERIOD; 238 } else if (clg->a2 + clg->a3 < rnd) { 239 clg->state = LOST_IN_GAP_PERIOD; 240 return true; 241 } 242 break; 243 case LOST_IN_BURST_PERIOD: 244 clg->state = TX_IN_GAP_PERIOD; 245 break; 246 } 247 248 return false; 249 } 250 251 /* loss_gilb_ell - Gilbert-Elliot model loss generator 252 * Generates losses according to the Gilbert-Elliot loss model or 253 * its special cases (Gilbert or Simple Gilbert) 254 * 255 * Makes a comparison between random number and the transition 256 * probabilities outgoing from the current state, then decides the 257 * next state. A second random number is extracted and the comparison 258 * with the loss probability of the current state decides if the next 259 * packet will be transmitted or lost. 260 */ 261 static bool loss_gilb_ell(struct netem_sched_data *q) 262 { 263 struct clgstate *clg = &q->clg; 264 265 switch (clg->state) { 266 case GOOD_STATE: 267 if (prandom_u32() < clg->a1) 268 clg->state = BAD_STATE; 269 if (prandom_u32() < clg->a4) 270 return true; 271 break; 272 case BAD_STATE: 273 if (prandom_u32() < clg->a2) 274 clg->state = GOOD_STATE; 275 if (prandom_u32() > clg->a3) 276 return true; 277 } 278 279 return false; 280 } 281 282 static bool loss_event(struct netem_sched_data *q) 283 { 284 switch (q->loss_model) { 285 case CLG_RANDOM: 286 /* Random packet drop 0 => none, ~0 => all */ 287 return q->loss && q->loss >= get_crandom(&q->loss_cor); 288 289 case CLG_4_STATES: 290 /* 4state loss model algorithm (used also for GI model) 291 * Extracts a value from the markov 4 state loss generator, 292 * if it is 1 drops a packet and if needed writes the event in 293 * the kernel logs 294 */ 295 return loss_4state(q); 296 297 case CLG_GILB_ELL: 298 /* Gilbert-Elliot loss model algorithm 299 * Extracts a value from the Gilbert-Elliot loss generator, 300 * if it is 1 drops a packet and if needed writes the event in 301 * the kernel logs 302 */ 303 return loss_gilb_ell(q); 304 } 305 306 return false; /* not reached */ 307 } 308 309 310 /* tabledist - return a pseudo-randomly distributed value with mean mu and 311 * std deviation sigma. Uses table lookup to approximate the desired 312 * distribution, and a uniformly-distributed pseudo-random source. 313 */ 314 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, 315 struct crndstate *state, 316 const struct disttable *dist) 317 { 318 psched_tdiff_t x; 319 long t; 320 u32 rnd; 321 322 if (sigma == 0) 323 return mu; 324 325 rnd = get_crandom(state); 326 327 /* default uniform distribution */ 328 if (dist == NULL) 329 return (rnd % (2*sigma)) - sigma + mu; 330 331 t = dist->table[rnd % dist->size]; 332 x = (sigma % NETEM_DIST_SCALE) * t; 333 if (x >= 0) 334 x += NETEM_DIST_SCALE/2; 335 else 336 x -= NETEM_DIST_SCALE/2; 337 338 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; 339 } 340 341 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) 342 { 343 u64 ticks; 344 345 len += q->packet_overhead; 346 347 if (q->cell_size) { 348 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); 349 350 if (len > cells * q->cell_size) /* extra cell needed for remainder */ 351 cells++; 352 len = cells * (q->cell_size + q->cell_overhead); 353 } 354 355 ticks = (u64)len * NSEC_PER_SEC; 356 357 do_div(ticks, q->rate); 358 return PSCHED_NS2TICKS(ticks); 359 } 360 361 static void tfifo_reset(struct Qdisc *sch) 362 { 363 struct netem_sched_data *q = qdisc_priv(sch); 364 struct rb_node *p = rb_first(&q->t_root); 365 366 while (p) { 367 struct sk_buff *skb = netem_rb_to_skb(p); 368 369 p = rb_next(p); 370 rb_erase(&skb->rbnode, &q->t_root); 371 rtnl_kfree_skbs(skb, skb); 372 } 373 } 374 375 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) 376 { 377 struct netem_sched_data *q = qdisc_priv(sch); 378 psched_time_t tnext = netem_skb_cb(nskb)->time_to_send; 379 struct rb_node **p = &q->t_root.rb_node, *parent = NULL; 380 381 while (*p) { 382 struct sk_buff *skb; 383 384 parent = *p; 385 skb = netem_rb_to_skb(parent); 386 if (tnext >= netem_skb_cb(skb)->time_to_send) 387 p = &parent->rb_right; 388 else 389 p = &parent->rb_left; 390 } 391 rb_link_node(&nskb->rbnode, parent, p); 392 rb_insert_color(&nskb->rbnode, &q->t_root); 393 sch->q.qlen++; 394 } 395 396 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead 397 * when we statistically choose to corrupt one, we instead segment it, returning 398 * the first packet to be corrupted, and re-enqueue the remaining frames 399 */ 400 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, 401 struct sk_buff **to_free) 402 { 403 struct sk_buff *segs; 404 netdev_features_t features = netif_skb_features(skb); 405 406 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 407 408 if (IS_ERR_OR_NULL(segs)) { 409 qdisc_drop(skb, sch, to_free); 410 return NULL; 411 } 412 consume_skb(skb); 413 return segs; 414 } 415 416 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) 417 { 418 skb->next = qh->head; 419 420 if (!qh->head) 421 qh->tail = skb; 422 qh->head = skb; 423 qh->qlen++; 424 } 425 426 /* 427 * Insert one skb into qdisc. 428 * Note: parent depends on return value to account for queue length. 429 * NET_XMIT_DROP: queue length didn't change. 430 * NET_XMIT_SUCCESS: one skb was queued. 431 */ 432 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 433 struct sk_buff **to_free) 434 { 435 struct netem_sched_data *q = qdisc_priv(sch); 436 /* We don't fill cb now as skb_unshare() may invalidate it */ 437 struct netem_skb_cb *cb; 438 struct sk_buff *skb2; 439 struct sk_buff *segs = NULL; 440 unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb); 441 int nb = 0; 442 int count = 1; 443 int rc = NET_XMIT_SUCCESS; 444 445 /* Random duplication */ 446 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) 447 ++count; 448 449 /* Drop packet? */ 450 if (loss_event(q)) { 451 if (q->ecn && INET_ECN_set_ce(skb)) 452 qdisc_qstats_drop(sch); /* mark packet */ 453 else 454 --count; 455 } 456 if (count == 0) { 457 qdisc_qstats_drop(sch); 458 __qdisc_drop(skb, to_free); 459 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 460 } 461 462 /* If a delay is expected, orphan the skb. (orphaning usually takes 463 * place at TX completion time, so _before_ the link transit delay) 464 */ 465 if (q->latency || q->jitter || q->rate) 466 skb_orphan_partial(skb); 467 468 /* 469 * If we need to duplicate packet, then re-insert at top of the 470 * qdisc tree, since parent queuer expects that only one 471 * skb will be queued. 472 */ 473 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { 474 struct Qdisc *rootq = qdisc_root(sch); 475 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ 476 477 q->duplicate = 0; 478 rootq->enqueue(skb2, rootq, to_free); 479 q->duplicate = dupsave; 480 } 481 482 /* 483 * Randomized packet corruption. 484 * Make copy if needed since we are modifying 485 * If packet is going to be hardware checksummed, then 486 * do it now in software before we mangle it. 487 */ 488 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { 489 if (skb_is_gso(skb)) { 490 segs = netem_segment(skb, sch, to_free); 491 if (!segs) 492 return NET_XMIT_DROP; 493 } else { 494 segs = skb; 495 } 496 497 skb = segs; 498 segs = segs->next; 499 500 skb = skb_unshare(skb, GFP_ATOMIC); 501 if (unlikely(!skb)) { 502 qdisc_qstats_drop(sch); 503 goto finish_segs; 504 } 505 if (skb->ip_summed == CHECKSUM_PARTIAL && 506 skb_checksum_help(skb)) { 507 qdisc_drop(skb, sch, to_free); 508 goto finish_segs; 509 } 510 511 skb->data[prandom_u32() % skb_headlen(skb)] ^= 512 1<<(prandom_u32() % 8); 513 } 514 515 if (unlikely(sch->q.qlen >= sch->limit)) 516 return qdisc_drop(skb, sch, to_free); 517 518 qdisc_qstats_backlog_inc(sch, skb); 519 520 cb = netem_skb_cb(skb); 521 if (q->gap == 0 || /* not doing reordering */ 522 q->counter < q->gap - 1 || /* inside last reordering gap */ 523 q->reorder < get_crandom(&q->reorder_cor)) { 524 psched_time_t now; 525 psched_tdiff_t delay; 526 527 delay = tabledist(q->latency, q->jitter, 528 &q->delay_cor, q->delay_dist); 529 530 now = psched_get_time(); 531 532 if (q->rate) { 533 struct netem_skb_cb *last = NULL; 534 535 if (sch->q.tail) 536 last = netem_skb_cb(sch->q.tail); 537 if (q->t_root.rb_node) { 538 struct sk_buff *t_skb; 539 struct netem_skb_cb *t_last; 540 541 t_skb = netem_rb_to_skb(rb_last(&q->t_root)); 542 t_last = netem_skb_cb(t_skb); 543 if (!last || 544 t_last->time_to_send > last->time_to_send) { 545 last = t_last; 546 } 547 } 548 549 if (last) { 550 /* 551 * Last packet in queue is reference point (now), 552 * calculate this time bonus and subtract 553 * from delay. 554 */ 555 delay -= last->time_to_send - now; 556 delay = max_t(psched_tdiff_t, 0, delay); 557 now = last->time_to_send; 558 } 559 560 delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); 561 } 562 563 cb->time_to_send = now + delay; 564 ++q->counter; 565 tfifo_enqueue(skb, sch); 566 } else { 567 /* 568 * Do re-ordering by putting one out of N packets at the front 569 * of the queue. 570 */ 571 cb->time_to_send = psched_get_time(); 572 q->counter = 0; 573 574 netem_enqueue_skb_head(&sch->q, skb); 575 sch->qstats.requeues++; 576 } 577 578 finish_segs: 579 if (segs) { 580 while (segs) { 581 skb2 = segs->next; 582 segs->next = NULL; 583 qdisc_skb_cb(segs)->pkt_len = segs->len; 584 last_len = segs->len; 585 rc = qdisc_enqueue(segs, sch, to_free); 586 if (rc != NET_XMIT_SUCCESS) { 587 if (net_xmit_drop_count(rc)) 588 qdisc_qstats_drop(sch); 589 } else { 590 nb++; 591 len += last_len; 592 } 593 segs = skb2; 594 } 595 sch->q.qlen += nb; 596 if (nb > 1) 597 qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); 598 } 599 return NET_XMIT_SUCCESS; 600 } 601 602 static struct sk_buff *netem_dequeue(struct Qdisc *sch) 603 { 604 struct netem_sched_data *q = qdisc_priv(sch); 605 struct sk_buff *skb; 606 struct rb_node *p; 607 608 tfifo_dequeue: 609 skb = __qdisc_dequeue_head(&sch->q); 610 if (skb) { 611 qdisc_qstats_backlog_dec(sch, skb); 612 deliver: 613 qdisc_bstats_update(sch, skb); 614 return skb; 615 } 616 p = rb_first(&q->t_root); 617 if (p) { 618 psched_time_t time_to_send; 619 620 skb = netem_rb_to_skb(p); 621 622 /* if more time remaining? */ 623 time_to_send = netem_skb_cb(skb)->time_to_send; 624 if (time_to_send <= psched_get_time()) { 625 rb_erase(p, &q->t_root); 626 627 sch->q.qlen--; 628 qdisc_qstats_backlog_dec(sch, skb); 629 skb->next = NULL; 630 skb->prev = NULL; 631 /* skb->dev shares skb->rbnode area, 632 * we need to restore its value. 633 */ 634 skb->dev = qdisc_dev(sch); 635 636 #ifdef CONFIG_NET_CLS_ACT 637 /* 638 * If it's at ingress let's pretend the delay is 639 * from the network (tstamp will be updated). 640 */ 641 if (skb->tc_redirected && skb->tc_from_ingress) 642 skb->tstamp = 0; 643 #endif 644 645 if (q->qdisc) { 646 unsigned int pkt_len = qdisc_pkt_len(skb); 647 struct sk_buff *to_free = NULL; 648 int err; 649 650 err = qdisc_enqueue(skb, q->qdisc, &to_free); 651 kfree_skb_list(to_free); 652 if (err != NET_XMIT_SUCCESS && 653 net_xmit_drop_count(err)) { 654 qdisc_qstats_drop(sch); 655 qdisc_tree_reduce_backlog(sch, 1, 656 pkt_len); 657 } 658 goto tfifo_dequeue; 659 } 660 goto deliver; 661 } 662 663 if (q->qdisc) { 664 skb = q->qdisc->ops->dequeue(q->qdisc); 665 if (skb) 666 goto deliver; 667 } 668 qdisc_watchdog_schedule(&q->watchdog, time_to_send); 669 } 670 671 if (q->qdisc) { 672 skb = q->qdisc->ops->dequeue(q->qdisc); 673 if (skb) 674 goto deliver; 675 } 676 return NULL; 677 } 678 679 static void netem_reset(struct Qdisc *sch) 680 { 681 struct netem_sched_data *q = qdisc_priv(sch); 682 683 qdisc_reset_queue(sch); 684 tfifo_reset(sch); 685 if (q->qdisc) 686 qdisc_reset(q->qdisc); 687 qdisc_watchdog_cancel(&q->watchdog); 688 } 689 690 static void dist_free(struct disttable *d) 691 { 692 kvfree(d); 693 } 694 695 /* 696 * Distribution data is a variable size payload containing 697 * signed 16 bit values. 698 */ 699 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) 700 { 701 struct netem_sched_data *q = qdisc_priv(sch); 702 size_t n = nla_len(attr)/sizeof(__s16); 703 const __s16 *data = nla_data(attr); 704 spinlock_t *root_lock; 705 struct disttable *d; 706 int i; 707 708 if (n > NETEM_DIST_MAX) 709 return -EINVAL; 710 711 d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); 712 if (!d) 713 return -ENOMEM; 714 715 d->size = n; 716 for (i = 0; i < n; i++) 717 d->table[i] = data[i]; 718 719 root_lock = qdisc_root_sleeping_lock(sch); 720 721 spin_lock_bh(root_lock); 722 swap(q->delay_dist, d); 723 spin_unlock_bh(root_lock); 724 725 dist_free(d); 726 return 0; 727 } 728 729 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) 730 { 731 const struct tc_netem_corr *c = nla_data(attr); 732 733 init_crandom(&q->delay_cor, c->delay_corr); 734 init_crandom(&q->loss_cor, c->loss_corr); 735 init_crandom(&q->dup_cor, c->dup_corr); 736 } 737 738 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) 739 { 740 const struct tc_netem_reorder *r = nla_data(attr); 741 742 q->reorder = r->probability; 743 init_crandom(&q->reorder_cor, r->correlation); 744 } 745 746 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) 747 { 748 const struct tc_netem_corrupt *r = nla_data(attr); 749 750 q->corrupt = r->probability; 751 init_crandom(&q->corrupt_cor, r->correlation); 752 } 753 754 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) 755 { 756 const struct tc_netem_rate *r = nla_data(attr); 757 758 q->rate = r->rate; 759 q->packet_overhead = r->packet_overhead; 760 q->cell_size = r->cell_size; 761 q->cell_overhead = r->cell_overhead; 762 if (q->cell_size) 763 q->cell_size_reciprocal = reciprocal_value(q->cell_size); 764 else 765 q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; 766 } 767 768 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) 769 { 770 const struct nlattr *la; 771 int rem; 772 773 nla_for_each_nested(la, attr, rem) { 774 u16 type = nla_type(la); 775 776 switch (type) { 777 case NETEM_LOSS_GI: { 778 const struct tc_netem_gimodel *gi = nla_data(la); 779 780 if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { 781 pr_info("netem: incorrect gi model size\n"); 782 return -EINVAL; 783 } 784 785 q->loss_model = CLG_4_STATES; 786 787 q->clg.state = TX_IN_GAP_PERIOD; 788 q->clg.a1 = gi->p13; 789 q->clg.a2 = gi->p31; 790 q->clg.a3 = gi->p32; 791 q->clg.a4 = gi->p14; 792 q->clg.a5 = gi->p23; 793 break; 794 } 795 796 case NETEM_LOSS_GE: { 797 const struct tc_netem_gemodel *ge = nla_data(la); 798 799 if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { 800 pr_info("netem: incorrect ge model size\n"); 801 return -EINVAL; 802 } 803 804 q->loss_model = CLG_GILB_ELL; 805 q->clg.state = GOOD_STATE; 806 q->clg.a1 = ge->p; 807 q->clg.a2 = ge->r; 808 q->clg.a3 = ge->h; 809 q->clg.a4 = ge->k1; 810 break; 811 } 812 813 default: 814 pr_info("netem: unknown loss type %u\n", type); 815 return -EINVAL; 816 } 817 } 818 819 return 0; 820 } 821 822 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { 823 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, 824 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, 825 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, 826 [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, 827 [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, 828 [TCA_NETEM_ECN] = { .type = NLA_U32 }, 829 [TCA_NETEM_RATE64] = { .type = NLA_U64 }, 830 }; 831 832 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, 833 const struct nla_policy *policy, int len) 834 { 835 int nested_len = nla_len(nla) - NLA_ALIGN(len); 836 837 if (nested_len < 0) { 838 pr_info("netem: invalid attributes len %d\n", nested_len); 839 return -EINVAL; 840 } 841 842 if (nested_len >= nla_attr_size(0)) 843 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), 844 nested_len, policy, NULL); 845 846 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); 847 return 0; 848 } 849 850 /* Parse netlink message to set options */ 851 static int netem_change(struct Qdisc *sch, struct nlattr *opt) 852 { 853 struct netem_sched_data *q = qdisc_priv(sch); 854 struct nlattr *tb[TCA_NETEM_MAX + 1]; 855 struct tc_netem_qopt *qopt; 856 struct clgstate old_clg; 857 int old_loss_model = CLG_RANDOM; 858 int ret; 859 860 if (opt == NULL) 861 return -EINVAL; 862 863 qopt = nla_data(opt); 864 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); 865 if (ret < 0) 866 return ret; 867 868 /* backup q->clg and q->loss_model */ 869 old_clg = q->clg; 870 old_loss_model = q->loss_model; 871 872 if (tb[TCA_NETEM_LOSS]) { 873 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); 874 if (ret) { 875 q->loss_model = old_loss_model; 876 return ret; 877 } 878 } else { 879 q->loss_model = CLG_RANDOM; 880 } 881 882 if (tb[TCA_NETEM_DELAY_DIST]) { 883 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); 884 if (ret) { 885 /* recover clg and loss_model, in case of 886 * q->clg and q->loss_model were modified 887 * in get_loss_clg() 888 */ 889 q->clg = old_clg; 890 q->loss_model = old_loss_model; 891 return ret; 892 } 893 } 894 895 sch->limit = qopt->limit; 896 897 q->latency = qopt->latency; 898 q->jitter = qopt->jitter; 899 q->limit = qopt->limit; 900 q->gap = qopt->gap; 901 q->counter = 0; 902 q->loss = qopt->loss; 903 q->duplicate = qopt->duplicate; 904 905 /* for compatibility with earlier versions. 906 * if gap is set, need to assume 100% probability 907 */ 908 if (q->gap) 909 q->reorder = ~0; 910 911 if (tb[TCA_NETEM_CORR]) 912 get_correlation(q, tb[TCA_NETEM_CORR]); 913 914 if (tb[TCA_NETEM_REORDER]) 915 get_reorder(q, tb[TCA_NETEM_REORDER]); 916 917 if (tb[TCA_NETEM_CORRUPT]) 918 get_corrupt(q, tb[TCA_NETEM_CORRUPT]); 919 920 if (tb[TCA_NETEM_RATE]) 921 get_rate(q, tb[TCA_NETEM_RATE]); 922 923 if (tb[TCA_NETEM_RATE64]) 924 q->rate = max_t(u64, q->rate, 925 nla_get_u64(tb[TCA_NETEM_RATE64])); 926 927 if (tb[TCA_NETEM_ECN]) 928 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); 929 930 return ret; 931 } 932 933 static int netem_init(struct Qdisc *sch, struct nlattr *opt) 934 { 935 struct netem_sched_data *q = qdisc_priv(sch); 936 int ret; 937 938 qdisc_watchdog_init(&q->watchdog, sch); 939 940 if (!opt) 941 return -EINVAL; 942 943 q->loss_model = CLG_RANDOM; 944 ret = netem_change(sch, opt); 945 if (ret) 946 pr_info("netem: change failed\n"); 947 return ret; 948 } 949 950 static void netem_destroy(struct Qdisc *sch) 951 { 952 struct netem_sched_data *q = qdisc_priv(sch); 953 954 qdisc_watchdog_cancel(&q->watchdog); 955 if (q->qdisc) 956 qdisc_destroy(q->qdisc); 957 dist_free(q->delay_dist); 958 } 959 960 static int dump_loss_model(const struct netem_sched_data *q, 961 struct sk_buff *skb) 962 { 963 struct nlattr *nest; 964 965 nest = nla_nest_start(skb, TCA_NETEM_LOSS); 966 if (nest == NULL) 967 goto nla_put_failure; 968 969 switch (q->loss_model) { 970 case CLG_RANDOM: 971 /* legacy loss model */ 972 nla_nest_cancel(skb, nest); 973 return 0; /* no data */ 974 975 case CLG_4_STATES: { 976 struct tc_netem_gimodel gi = { 977 .p13 = q->clg.a1, 978 .p31 = q->clg.a2, 979 .p32 = q->clg.a3, 980 .p14 = q->clg.a4, 981 .p23 = q->clg.a5, 982 }; 983 984 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) 985 goto nla_put_failure; 986 break; 987 } 988 case CLG_GILB_ELL: { 989 struct tc_netem_gemodel ge = { 990 .p = q->clg.a1, 991 .r = q->clg.a2, 992 .h = q->clg.a3, 993 .k1 = q->clg.a4, 994 }; 995 996 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) 997 goto nla_put_failure; 998 break; 999 } 1000 } 1001 1002 nla_nest_end(skb, nest); 1003 return 0; 1004 1005 nla_put_failure: 1006 nla_nest_cancel(skb, nest); 1007 return -1; 1008 } 1009 1010 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) 1011 { 1012 const struct netem_sched_data *q = qdisc_priv(sch); 1013 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); 1014 struct tc_netem_qopt qopt; 1015 struct tc_netem_corr cor; 1016 struct tc_netem_reorder reorder; 1017 struct tc_netem_corrupt corrupt; 1018 struct tc_netem_rate rate; 1019 1020 qopt.latency = q->latency; 1021 qopt.jitter = q->jitter; 1022 qopt.limit = q->limit; 1023 qopt.loss = q->loss; 1024 qopt.gap = q->gap; 1025 qopt.duplicate = q->duplicate; 1026 if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) 1027 goto nla_put_failure; 1028 1029 cor.delay_corr = q->delay_cor.rho; 1030 cor.loss_corr = q->loss_cor.rho; 1031 cor.dup_corr = q->dup_cor.rho; 1032 if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) 1033 goto nla_put_failure; 1034 1035 reorder.probability = q->reorder; 1036 reorder.correlation = q->reorder_cor.rho; 1037 if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) 1038 goto nla_put_failure; 1039 1040 corrupt.probability = q->corrupt; 1041 corrupt.correlation = q->corrupt_cor.rho; 1042 if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) 1043 goto nla_put_failure; 1044 1045 if (q->rate >= (1ULL << 32)) { 1046 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate, 1047 TCA_NETEM_PAD)) 1048 goto nla_put_failure; 1049 rate.rate = ~0U; 1050 } else { 1051 rate.rate = q->rate; 1052 } 1053 rate.packet_overhead = q->packet_overhead; 1054 rate.cell_size = q->cell_size; 1055 rate.cell_overhead = q->cell_overhead; 1056 if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) 1057 goto nla_put_failure; 1058 1059 if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) 1060 goto nla_put_failure; 1061 1062 if (dump_loss_model(q, skb) != 0) 1063 goto nla_put_failure; 1064 1065 return nla_nest_end(skb, nla); 1066 1067 nla_put_failure: 1068 nlmsg_trim(skb, nla); 1069 return -1; 1070 } 1071 1072 static int netem_dump_class(struct Qdisc *sch, unsigned long cl, 1073 struct sk_buff *skb, struct tcmsg *tcm) 1074 { 1075 struct netem_sched_data *q = qdisc_priv(sch); 1076 1077 if (cl != 1 || !q->qdisc) /* only one class */ 1078 return -ENOENT; 1079 1080 tcm->tcm_handle |= TC_H_MIN(1); 1081 tcm->tcm_info = q->qdisc->handle; 1082 1083 return 0; 1084 } 1085 1086 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 1087 struct Qdisc **old) 1088 { 1089 struct netem_sched_data *q = qdisc_priv(sch); 1090 1091 *old = qdisc_replace(sch, new, &q->qdisc); 1092 return 0; 1093 } 1094 1095 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) 1096 { 1097 struct netem_sched_data *q = qdisc_priv(sch); 1098 return q->qdisc; 1099 } 1100 1101 static unsigned long netem_find(struct Qdisc *sch, u32 classid) 1102 { 1103 return 1; 1104 } 1105 1106 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) 1107 { 1108 if (!walker->stop) { 1109 if (walker->count >= walker->skip) 1110 if (walker->fn(sch, 1, walker) < 0) { 1111 walker->stop = 1; 1112 return; 1113 } 1114 walker->count++; 1115 } 1116 } 1117 1118 static const struct Qdisc_class_ops netem_class_ops = { 1119 .graft = netem_graft, 1120 .leaf = netem_leaf, 1121 .find = netem_find, 1122 .walk = netem_walk, 1123 .dump = netem_dump_class, 1124 }; 1125 1126 static struct Qdisc_ops netem_qdisc_ops __read_mostly = { 1127 .id = "netem", 1128 .cl_ops = &netem_class_ops, 1129 .priv_size = sizeof(struct netem_sched_data), 1130 .enqueue = netem_enqueue, 1131 .dequeue = netem_dequeue, 1132 .peek = qdisc_peek_dequeued, 1133 .init = netem_init, 1134 .reset = netem_reset, 1135 .destroy = netem_destroy, 1136 .change = netem_change, 1137 .dump = netem_dump, 1138 .owner = THIS_MODULE, 1139 }; 1140 1141 1142 static int __init netem_module_init(void) 1143 { 1144 pr_info("netem: version " VERSION "\n"); 1145 return register_qdisc(&netem_qdisc_ops); 1146 } 1147 static void __exit netem_module_exit(void) 1148 { 1149 unregister_qdisc(&netem_qdisc_ops); 1150 } 1151 module_init(netem_module_init) 1152 module_exit(netem_module_exit) 1153 MODULE_LICENSE("GPL"); 1154