1 /* 2 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) 3 * 4 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 * 11 * Meant to be mostly used for locally generated traffic : 12 * Fast classification depends on skb->sk being set before reaching us. 13 * If not, (router workload), we use rxhash as fallback, with 32 bits wide hash. 14 * All packets belonging to a socket are considered as a 'flow'. 15 * 16 * Flows are dynamically allocated and stored in a hash table of RB trees 17 * They are also part of one Round Robin 'queues' (new or old flows) 18 * 19 * Burst avoidance (aka pacing) capability : 20 * 21 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a 22 * bunch of packets, and this packet scheduler adds delay between 23 * packets to respect rate limitation. 24 * 25 * enqueue() : 26 * - lookup one RB tree (out of 1024 or more) to find the flow. 27 * If non existent flow, create it, add it to the tree. 28 * Add skb to the per flow list of skb (fifo). 29 * - Use a special fifo for high prio packets 30 * 31 * dequeue() : serves flows in Round Robin 32 * Note : When a flow becomes empty, we do not immediately remove it from 33 * rb trees, for performance reasons (its expected to send additional packets, 34 * or SLAB cache will reuse socket for another flow) 35 */ 36 37 #include <linux/module.h> 38 #include <linux/types.h> 39 #include <linux/kernel.h> 40 #include <linux/jiffies.h> 41 #include <linux/string.h> 42 #include <linux/in.h> 43 #include <linux/errno.h> 44 #include <linux/init.h> 45 #include <linux/skbuff.h> 46 #include <linux/slab.h> 47 #include <linux/rbtree.h> 48 #include <linux/hash.h> 49 #include <linux/prefetch.h> 50 #include <linux/vmalloc.h> 51 #include <net/netlink.h> 52 #include <net/pkt_sched.h> 53 #include <net/sock.h> 54 #include <net/tcp_states.h> 55 #include <net/tcp.h> 56 57 /* 58 * Per flow structure, dynamically allocated 59 */ 60 struct fq_flow { 61 struct sk_buff *head; /* list of skbs for this flow : first skb */ 62 union { 63 struct sk_buff *tail; /* last skb in the list */ 64 unsigned long age; /* jiffies when flow was emptied, for gc */ 65 }; 66 struct rb_node fq_node; /* anchor in fq_root[] trees */ 67 struct sock *sk; 68 int qlen; /* number of packets in flow queue */ 69 int credit; 70 u32 socket_hash; /* sk_hash */ 71 struct fq_flow *next; /* next pointer in RR lists, or &detached */ 72 73 struct rb_node rate_node; /* anchor in q->delayed tree */ 74 u64 time_next_packet; 75 }; 76 77 struct fq_flow_head { 78 struct fq_flow *first; 79 struct fq_flow *last; 80 }; 81 82 struct fq_sched_data { 83 struct fq_flow_head new_flows; 84 85 struct fq_flow_head old_flows; 86 87 struct rb_root delayed; /* for rate limited flows */ 88 u64 time_next_delayed_flow; 89 90 struct fq_flow internal; /* for non classified or high prio packets */ 91 u32 quantum; 92 u32 initial_quantum; 93 u32 flow_refill_delay; 94 u32 flow_max_rate; /* optional max rate per flow */ 95 u32 flow_plimit; /* max packets per flow */ 96 u32 orphan_mask; /* mask for orphaned skb */ 97 struct rb_root *fq_root; 98 u8 rate_enable; 99 u8 fq_trees_log; 100 101 u32 flows; 102 u32 inactive_flows; 103 u32 throttled_flows; 104 105 u64 stat_gc_flows; 106 u64 stat_internal_packets; 107 u64 stat_tcp_retrans; 108 u64 stat_throttled; 109 u64 stat_flows_plimit; 110 u64 stat_pkts_too_long; 111 u64 stat_allocation_errors; 112 struct qdisc_watchdog watchdog; 113 }; 114 115 /* special value to mark a detached flow (not on old/new list) */ 116 static struct fq_flow detached, throttled; 117 118 static void fq_flow_set_detached(struct fq_flow *f) 119 { 120 f->next = &detached; 121 f->age = jiffies; 122 } 123 124 static bool fq_flow_is_detached(const struct fq_flow *f) 125 { 126 return f->next == &detached; 127 } 128 129 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) 130 { 131 struct rb_node **p = &q->delayed.rb_node, *parent = NULL; 132 133 while (*p) { 134 struct fq_flow *aux; 135 136 parent = *p; 137 aux = container_of(parent, struct fq_flow, rate_node); 138 if (f->time_next_packet >= aux->time_next_packet) 139 p = &parent->rb_right; 140 else 141 p = &parent->rb_left; 142 } 143 rb_link_node(&f->rate_node, parent, p); 144 rb_insert_color(&f->rate_node, &q->delayed); 145 q->throttled_flows++; 146 q->stat_throttled++; 147 148 f->next = &throttled; 149 if (q->time_next_delayed_flow > f->time_next_packet) 150 q->time_next_delayed_flow = f->time_next_packet; 151 } 152 153 154 static struct kmem_cache *fq_flow_cachep __read_mostly; 155 156 static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow) 157 { 158 if (head->first) 159 head->last->next = flow; 160 else 161 head->first = flow; 162 head->last = flow; 163 flow->next = NULL; 164 } 165 166 /* limit number of collected flows per round */ 167 #define FQ_GC_MAX 8 168 #define FQ_GC_AGE (3*HZ) 169 170 static bool fq_gc_candidate(const struct fq_flow *f) 171 { 172 return fq_flow_is_detached(f) && 173 time_after(jiffies, f->age + FQ_GC_AGE); 174 } 175 176 static void fq_gc(struct fq_sched_data *q, 177 struct rb_root *root, 178 struct sock *sk) 179 { 180 struct fq_flow *f, *tofree[FQ_GC_MAX]; 181 struct rb_node **p, *parent; 182 int fcnt = 0; 183 184 p = &root->rb_node; 185 parent = NULL; 186 while (*p) { 187 parent = *p; 188 189 f = container_of(parent, struct fq_flow, fq_node); 190 if (f->sk == sk) 191 break; 192 193 if (fq_gc_candidate(f)) { 194 tofree[fcnt++] = f; 195 if (fcnt == FQ_GC_MAX) 196 break; 197 } 198 199 if (f->sk > sk) 200 p = &parent->rb_right; 201 else 202 p = &parent->rb_left; 203 } 204 205 q->flows -= fcnt; 206 q->inactive_flows -= fcnt; 207 q->stat_gc_flows += fcnt; 208 while (fcnt) { 209 struct fq_flow *f = tofree[--fcnt]; 210 211 rb_erase(&f->fq_node, root); 212 kmem_cache_free(fq_flow_cachep, f); 213 } 214 } 215 216 static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) 217 { 218 struct rb_node **p, *parent; 219 struct sock *sk = skb->sk; 220 struct rb_root *root; 221 struct fq_flow *f; 222 223 /* warning: no starvation prevention... */ 224 if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) 225 return &q->internal; 226 227 /* SYNACK messages are attached to a listener socket. 228 * 1) They are not part of a 'flow' yet 229 * 2) We do not want to rate limit them (eg SYNFLOOD attack), 230 * especially if the listener set SO_MAX_PACING_RATE 231 * 3) We pretend they are orphaned 232 */ 233 if (!sk || sk->sk_state == TCP_LISTEN) { 234 unsigned long hash = skb_get_hash(skb) & q->orphan_mask; 235 236 /* By forcing low order bit to 1, we make sure to not 237 * collide with a local flow (socket pointers are word aligned) 238 */ 239 sk = (struct sock *)((hash << 1) | 1UL); 240 skb_orphan(skb); 241 } 242 243 root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)]; 244 245 if (q->flows >= (2U << q->fq_trees_log) && 246 q->inactive_flows > q->flows/2) 247 fq_gc(q, root, sk); 248 249 p = &root->rb_node; 250 parent = NULL; 251 while (*p) { 252 parent = *p; 253 254 f = container_of(parent, struct fq_flow, fq_node); 255 if (f->sk == sk) { 256 /* socket might have been reallocated, so check 257 * if its sk_hash is the same. 258 * It not, we need to refill credit with 259 * initial quantum 260 */ 261 if (unlikely(skb->sk && 262 f->socket_hash != sk->sk_hash)) { 263 f->credit = q->initial_quantum; 264 f->socket_hash = sk->sk_hash; 265 f->time_next_packet = 0ULL; 266 } 267 return f; 268 } 269 if (f->sk > sk) 270 p = &parent->rb_right; 271 else 272 p = &parent->rb_left; 273 } 274 275 f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN); 276 if (unlikely(!f)) { 277 q->stat_allocation_errors++; 278 return &q->internal; 279 } 280 fq_flow_set_detached(f); 281 f->sk = sk; 282 if (skb->sk) 283 f->socket_hash = sk->sk_hash; 284 f->credit = q->initial_quantum; 285 286 rb_link_node(&f->fq_node, parent, p); 287 rb_insert_color(&f->fq_node, root); 288 289 q->flows++; 290 q->inactive_flows++; 291 return f; 292 } 293 294 295 /* remove one skb from head of flow queue */ 296 static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) 297 { 298 struct sk_buff *skb = flow->head; 299 300 if (skb) { 301 flow->head = skb->next; 302 skb->next = NULL; 303 flow->qlen--; 304 qdisc_qstats_backlog_dec(sch, skb); 305 sch->q.qlen--; 306 } 307 return skb; 308 } 309 310 /* We might add in the future detection of retransmits 311 * For the time being, just return false 312 */ 313 static bool skb_is_retransmit(struct sk_buff *skb) 314 { 315 return false; 316 } 317 318 /* add skb to flow queue 319 * flow queue is a linked list, kind of FIFO, except for TCP retransmits 320 * We special case tcp retransmits to be transmitted before other packets. 321 * We rely on fact that TCP retransmits are unlikely, so we do not waste 322 * a separate queue or a pointer. 323 * head-> [retrans pkt 1] 324 * [retrans pkt 2] 325 * [ normal pkt 1] 326 * [ normal pkt 2] 327 * [ normal pkt 3] 328 * tail-> [ normal pkt 4] 329 */ 330 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) 331 { 332 struct sk_buff *prev, *head = flow->head; 333 334 skb->next = NULL; 335 if (!head) { 336 flow->head = skb; 337 flow->tail = skb; 338 return; 339 } 340 if (likely(!skb_is_retransmit(skb))) { 341 flow->tail->next = skb; 342 flow->tail = skb; 343 return; 344 } 345 346 /* This skb is a tcp retransmit, 347 * find the last retrans packet in the queue 348 */ 349 prev = NULL; 350 while (skb_is_retransmit(head)) { 351 prev = head; 352 head = head->next; 353 if (!head) 354 break; 355 } 356 if (!prev) { /* no rtx packet in queue, become the new head */ 357 skb->next = flow->head; 358 flow->head = skb; 359 } else { 360 if (prev == flow->tail) 361 flow->tail = skb; 362 else 363 skb->next = prev->next; 364 prev->next = skb; 365 } 366 } 367 368 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 369 { 370 struct fq_sched_data *q = qdisc_priv(sch); 371 struct fq_flow *f; 372 373 if (unlikely(sch->q.qlen >= sch->limit)) 374 return qdisc_drop(skb, sch); 375 376 f = fq_classify(skb, q); 377 if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { 378 q->stat_flows_plimit++; 379 return qdisc_drop(skb, sch); 380 } 381 382 f->qlen++; 383 if (skb_is_retransmit(skb)) 384 q->stat_tcp_retrans++; 385 qdisc_qstats_backlog_inc(sch, skb); 386 if (fq_flow_is_detached(f)) { 387 fq_flow_add_tail(&q->new_flows, f); 388 if (time_after(jiffies, f->age + q->flow_refill_delay)) 389 f->credit = max_t(u32, f->credit, q->quantum); 390 q->inactive_flows--; 391 } 392 393 /* Note: this overwrites f->age */ 394 flow_queue_add(f, skb); 395 396 if (unlikely(f == &q->internal)) { 397 q->stat_internal_packets++; 398 } 399 sch->q.qlen++; 400 401 return NET_XMIT_SUCCESS; 402 } 403 404 static void fq_check_throttled(struct fq_sched_data *q, u64 now) 405 { 406 struct rb_node *p; 407 408 if (q->time_next_delayed_flow > now) 409 return; 410 411 q->time_next_delayed_flow = ~0ULL; 412 while ((p = rb_first(&q->delayed)) != NULL) { 413 struct fq_flow *f = container_of(p, struct fq_flow, rate_node); 414 415 if (f->time_next_packet > now) { 416 q->time_next_delayed_flow = f->time_next_packet; 417 break; 418 } 419 rb_erase(p, &q->delayed); 420 q->throttled_flows--; 421 fq_flow_add_tail(&q->old_flows, f); 422 } 423 } 424 425 static struct sk_buff *fq_dequeue(struct Qdisc *sch) 426 { 427 struct fq_sched_data *q = qdisc_priv(sch); 428 u64 now = ktime_get_ns(); 429 struct fq_flow_head *head; 430 struct sk_buff *skb; 431 struct fq_flow *f; 432 u32 rate; 433 434 skb = fq_dequeue_head(sch, &q->internal); 435 if (skb) 436 goto out; 437 fq_check_throttled(q, now); 438 begin: 439 head = &q->new_flows; 440 if (!head->first) { 441 head = &q->old_flows; 442 if (!head->first) { 443 if (q->time_next_delayed_flow != ~0ULL) 444 qdisc_watchdog_schedule_ns(&q->watchdog, 445 q->time_next_delayed_flow, 446 false); 447 return NULL; 448 } 449 } 450 f = head->first; 451 452 if (f->credit <= 0) { 453 f->credit += q->quantum; 454 head->first = f->next; 455 fq_flow_add_tail(&q->old_flows, f); 456 goto begin; 457 } 458 459 skb = f->head; 460 if (unlikely(skb && now < f->time_next_packet && 461 !skb_is_tcp_pure_ack(skb))) { 462 head->first = f->next; 463 fq_flow_set_throttled(q, f); 464 goto begin; 465 } 466 467 skb = fq_dequeue_head(sch, f); 468 if (!skb) { 469 head->first = f->next; 470 /* force a pass through old_flows to prevent starvation */ 471 if ((head == &q->new_flows) && q->old_flows.first) { 472 fq_flow_add_tail(&q->old_flows, f); 473 } else { 474 fq_flow_set_detached(f); 475 q->inactive_flows++; 476 } 477 goto begin; 478 } 479 prefetch(&skb->end); 480 f->credit -= qdisc_pkt_len(skb); 481 482 if (f->credit > 0 || !q->rate_enable) 483 goto out; 484 485 /* Do not pace locally generated ack packets */ 486 if (skb_is_tcp_pure_ack(skb)) 487 goto out; 488 489 rate = q->flow_max_rate; 490 if (skb->sk) 491 rate = min(skb->sk->sk_pacing_rate, rate); 492 493 if (rate != ~0U) { 494 u32 plen = max(qdisc_pkt_len(skb), q->quantum); 495 u64 len = (u64)plen * NSEC_PER_SEC; 496 497 if (likely(rate)) 498 do_div(len, rate); 499 /* Since socket rate can change later, 500 * clamp the delay to 1 second. 501 * Really, providers of too big packets should be fixed ! 502 */ 503 if (unlikely(len > NSEC_PER_SEC)) { 504 len = NSEC_PER_SEC; 505 q->stat_pkts_too_long++; 506 } 507 508 f->time_next_packet = now + len; 509 } 510 out: 511 qdisc_bstats_update(sch, skb); 512 return skb; 513 } 514 515 static void fq_reset(struct Qdisc *sch) 516 { 517 struct fq_sched_data *q = qdisc_priv(sch); 518 struct rb_root *root; 519 struct sk_buff *skb; 520 struct rb_node *p; 521 struct fq_flow *f; 522 unsigned int idx; 523 524 while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) 525 kfree_skb(skb); 526 527 if (!q->fq_root) 528 return; 529 530 for (idx = 0; idx < (1U << q->fq_trees_log); idx++) { 531 root = &q->fq_root[idx]; 532 while ((p = rb_first(root)) != NULL) { 533 f = container_of(p, struct fq_flow, fq_node); 534 rb_erase(p, root); 535 536 while ((skb = fq_dequeue_head(sch, f)) != NULL) 537 kfree_skb(skb); 538 539 kmem_cache_free(fq_flow_cachep, f); 540 } 541 } 542 q->new_flows.first = NULL; 543 q->old_flows.first = NULL; 544 q->delayed = RB_ROOT; 545 q->flows = 0; 546 q->inactive_flows = 0; 547 q->throttled_flows = 0; 548 } 549 550 static void fq_rehash(struct fq_sched_data *q, 551 struct rb_root *old_array, u32 old_log, 552 struct rb_root *new_array, u32 new_log) 553 { 554 struct rb_node *op, **np, *parent; 555 struct rb_root *oroot, *nroot; 556 struct fq_flow *of, *nf; 557 int fcnt = 0; 558 u32 idx; 559 560 for (idx = 0; idx < (1U << old_log); idx++) { 561 oroot = &old_array[idx]; 562 while ((op = rb_first(oroot)) != NULL) { 563 rb_erase(op, oroot); 564 of = container_of(op, struct fq_flow, fq_node); 565 if (fq_gc_candidate(of)) { 566 fcnt++; 567 kmem_cache_free(fq_flow_cachep, of); 568 continue; 569 } 570 nroot = &new_array[hash_32((u32)(long)of->sk, new_log)]; 571 572 np = &nroot->rb_node; 573 parent = NULL; 574 while (*np) { 575 parent = *np; 576 577 nf = container_of(parent, struct fq_flow, fq_node); 578 BUG_ON(nf->sk == of->sk); 579 580 if (nf->sk > of->sk) 581 np = &parent->rb_right; 582 else 583 np = &parent->rb_left; 584 } 585 586 rb_link_node(&of->fq_node, parent, np); 587 rb_insert_color(&of->fq_node, nroot); 588 } 589 } 590 q->flows -= fcnt; 591 q->inactive_flows -= fcnt; 592 q->stat_gc_flows += fcnt; 593 } 594 595 static void *fq_alloc_node(size_t sz, int node) 596 { 597 void *ptr; 598 599 ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); 600 if (!ptr) 601 ptr = vmalloc_node(sz, node); 602 return ptr; 603 } 604 605 static void fq_free(void *addr) 606 { 607 kvfree(addr); 608 } 609 610 static int fq_resize(struct Qdisc *sch, u32 log) 611 { 612 struct fq_sched_data *q = qdisc_priv(sch); 613 struct rb_root *array; 614 void *old_fq_root; 615 u32 idx; 616 617 if (q->fq_root && log == q->fq_trees_log) 618 return 0; 619 620 /* If XPS was setup, we can allocate memory on right NUMA node */ 621 array = fq_alloc_node(sizeof(struct rb_root) << log, 622 netdev_queue_numa_node_read(sch->dev_queue)); 623 if (!array) 624 return -ENOMEM; 625 626 for (idx = 0; idx < (1U << log); idx++) 627 array[idx] = RB_ROOT; 628 629 sch_tree_lock(sch); 630 631 old_fq_root = q->fq_root; 632 if (old_fq_root) 633 fq_rehash(q, old_fq_root, q->fq_trees_log, array, log); 634 635 q->fq_root = array; 636 q->fq_trees_log = log; 637 638 sch_tree_unlock(sch); 639 640 fq_free(old_fq_root); 641 642 return 0; 643 } 644 645 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { 646 [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, 647 [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, 648 [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, 649 [TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 }, 650 [TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 }, 651 [TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 }, 652 [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, 653 [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, 654 [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, 655 }; 656 657 static int fq_change(struct Qdisc *sch, struct nlattr *opt) 658 { 659 struct fq_sched_data *q = qdisc_priv(sch); 660 struct nlattr *tb[TCA_FQ_MAX + 1]; 661 int err, drop_count = 0; 662 u32 fq_log; 663 664 if (!opt) 665 return -EINVAL; 666 667 err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); 668 if (err < 0) 669 return err; 670 671 sch_tree_lock(sch); 672 673 fq_log = q->fq_trees_log; 674 675 if (tb[TCA_FQ_BUCKETS_LOG]) { 676 u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]); 677 678 if (nval >= 1 && nval <= ilog2(256*1024)) 679 fq_log = nval; 680 else 681 err = -EINVAL; 682 } 683 if (tb[TCA_FQ_PLIMIT]) 684 sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]); 685 686 if (tb[TCA_FQ_FLOW_PLIMIT]) 687 q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); 688 689 if (tb[TCA_FQ_QUANTUM]) { 690 u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); 691 692 if (quantum > 0) 693 q->quantum = quantum; 694 else 695 err = -EINVAL; 696 } 697 698 if (tb[TCA_FQ_INITIAL_QUANTUM]) 699 q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); 700 701 if (tb[TCA_FQ_FLOW_DEFAULT_RATE]) 702 pr_warn_ratelimited("sch_fq: defrate %u ignored.\n", 703 nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE])); 704 705 if (tb[TCA_FQ_FLOW_MAX_RATE]) 706 q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); 707 708 if (tb[TCA_FQ_RATE_ENABLE]) { 709 u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); 710 711 if (enable <= 1) 712 q->rate_enable = enable; 713 else 714 err = -EINVAL; 715 } 716 717 if (tb[TCA_FQ_FLOW_REFILL_DELAY]) { 718 u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ; 719 720 q->flow_refill_delay = usecs_to_jiffies(usecs_delay); 721 } 722 723 if (tb[TCA_FQ_ORPHAN_MASK]) 724 q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]); 725 726 if (!err) { 727 sch_tree_unlock(sch); 728 err = fq_resize(sch, fq_log); 729 sch_tree_lock(sch); 730 } 731 while (sch->q.qlen > sch->limit) { 732 struct sk_buff *skb = fq_dequeue(sch); 733 734 if (!skb) 735 break; 736 kfree_skb(skb); 737 drop_count++; 738 } 739 qdisc_tree_decrease_qlen(sch, drop_count); 740 741 sch_tree_unlock(sch); 742 return err; 743 } 744 745 static void fq_destroy(struct Qdisc *sch) 746 { 747 struct fq_sched_data *q = qdisc_priv(sch); 748 749 fq_reset(sch); 750 fq_free(q->fq_root); 751 qdisc_watchdog_cancel(&q->watchdog); 752 } 753 754 static int fq_init(struct Qdisc *sch, struct nlattr *opt) 755 { 756 struct fq_sched_data *q = qdisc_priv(sch); 757 int err; 758 759 sch->limit = 10000; 760 q->flow_plimit = 100; 761 q->quantum = 2 * psched_mtu(qdisc_dev(sch)); 762 q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); 763 q->flow_refill_delay = msecs_to_jiffies(40); 764 q->flow_max_rate = ~0U; 765 q->rate_enable = 1; 766 q->new_flows.first = NULL; 767 q->old_flows.first = NULL; 768 q->delayed = RB_ROOT; 769 q->fq_root = NULL; 770 q->fq_trees_log = ilog2(1024); 771 q->orphan_mask = 1024 - 1; 772 qdisc_watchdog_init(&q->watchdog, sch); 773 774 if (opt) 775 err = fq_change(sch, opt); 776 else 777 err = fq_resize(sch, q->fq_trees_log); 778 779 return err; 780 } 781 782 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) 783 { 784 struct fq_sched_data *q = qdisc_priv(sch); 785 struct nlattr *opts; 786 787 opts = nla_nest_start(skb, TCA_OPTIONS); 788 if (opts == NULL) 789 goto nla_put_failure; 790 791 /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ 792 793 if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || 794 nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || 795 nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) || 796 nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) || 797 nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) || 798 nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) || 799 nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, 800 jiffies_to_usecs(q->flow_refill_delay)) || 801 nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || 802 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) 803 goto nla_put_failure; 804 805 return nla_nest_end(skb, opts); 806 807 nla_put_failure: 808 return -1; 809 } 810 811 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 812 { 813 struct fq_sched_data *q = qdisc_priv(sch); 814 u64 now = ktime_get_ns(); 815 struct tc_fq_qd_stats st = { 816 .gc_flows = q->stat_gc_flows, 817 .highprio_packets = q->stat_internal_packets, 818 .tcp_retrans = q->stat_tcp_retrans, 819 .throttled = q->stat_throttled, 820 .flows_plimit = q->stat_flows_plimit, 821 .pkts_too_long = q->stat_pkts_too_long, 822 .allocation_errors = q->stat_allocation_errors, 823 .flows = q->flows, 824 .inactive_flows = q->inactive_flows, 825 .throttled_flows = q->throttled_flows, 826 .time_next_delayed_flow = q->time_next_delayed_flow - now, 827 }; 828 829 return gnet_stats_copy_app(d, &st, sizeof(st)); 830 } 831 832 static struct Qdisc_ops fq_qdisc_ops __read_mostly = { 833 .id = "fq", 834 .priv_size = sizeof(struct fq_sched_data), 835 836 .enqueue = fq_enqueue, 837 .dequeue = fq_dequeue, 838 .peek = qdisc_peek_dequeued, 839 .init = fq_init, 840 .reset = fq_reset, 841 .destroy = fq_destroy, 842 .change = fq_change, 843 .dump = fq_dump, 844 .dump_stats = fq_dump_stats, 845 .owner = THIS_MODULE, 846 }; 847 848 static int __init fq_module_init(void) 849 { 850 int ret; 851 852 fq_flow_cachep = kmem_cache_create("fq_flow_cache", 853 sizeof(struct fq_flow), 854 0, 0, NULL); 855 if (!fq_flow_cachep) 856 return -ENOMEM; 857 858 ret = register_qdisc(&fq_qdisc_ops); 859 if (ret) 860 kmem_cache_destroy(fq_flow_cachep); 861 return ret; 862 } 863 864 static void __exit fq_module_exit(void) 865 { 866 unregister_qdisc(&fq_qdisc_ops); 867 kmem_cache_destroy(fq_flow_cachep); 868 } 869 870 module_init(fq_module_init) 871 module_exit(fq_module_exit) 872 MODULE_AUTHOR("Eric Dumazet"); 873 MODULE_LICENSE("GPL"); 874