// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
 *
 * Meant to be mostly used for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with a 32 bits wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin 'queue' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (e.g. TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If the flow does not exist, create it and add it to the tree.
 *     Add skb to the per flow list of skb (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it's expected to send additional packets,
 * or SLAB cache will reuse the socket for another flow)
 */
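/* Illustrative configuration sketch: these knobs map to the TCA_FQ_*
 * attributes handled in fq_change() below, here expressed with the iproute2
 * 'tc' front end (device name and values are examples; quantum/initial_quantum
 * mirror the Ethernet defaults of 2 and 10 times psched_mtu()):
 *
 *	tc qdisc replace dev eth0 root fq limit 10000 flow_limit 100 \
 *		quantum 3028 initial_quantum 15140 maxrate 1gbit
 */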
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>

struct fq_skb_cb {
	u64	time_to_send;
};

static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Per flow structure, dynamically allocated.
 * If packets have monotonically increasing time_to_send, they are placed in O(1)
 * in a linear list (head,tail), otherwise they are placed in a rbtree (t_root).
 */
struct fq_flow {
	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* jiffies when flow was emptied, for gc */
	};
	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
	struct sock	*sk;
	int		qlen;		/* number of packets in flow queue */
	int		credit;
	u32		socket_hash;	/* sk_hash */
	struct fq_flow	*next;		/* next pointer in RR lists, or &detached */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
};

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;
	unsigned long	unthrottle_latency_ns;

	struct fq_flow	internal;	/* for non classified or high prio packets */
	u32		quantum;
	u32		initial_quantum;
	u32		flow_refill_delay;
	u32		flow_plimit;	/* max packets per flow */
	unsigned long	flow_max_rate;	/* optional max rate per flow */
	u64		ce_threshold;
	u32		orphan_mask;	/* mask for orphaned skb */
	u32		low_rate_threshold;
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;

	u32		flows;
	u32		inactive_flows;
	u32		throttled_flows;

	u64		stat_gc_flows;
	u64		stat_internal_packets;
	u64		stat_throttled;
	u64		stat_ce_mark;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;

	u32		timer_slack;	/* hrtimer slack in ns */
	struct qdisc_watchdog watchdog;
};

/* special value to mark a detached flow (not on old/new list) */
static struct fq_flow detached, throttled;

static void fq_flow_set_detached(struct fq_flow *f)
{
	f->next = &detached;
	f->age = jiffies;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return f->next == &detached;
}

static bool fq_flow_is_throttled(const struct fq_flow *f)
{
	return f->next == &throttled;
}

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	rb_erase(&f->rate_node, &q->delayed);
	q->throttled_flows--;
	fq_flow_add_tail(&q->old_flows, f);
}

static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = rb_entry(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;


/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)
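/* For example, with HZ == 1000 a flow becomes a GC candidate once it has
 * been detached for more than 3000 jiffies (~3 seconds), and at most
 * FQ_GC_MAX (8) such flows are reclaimed per fq_gc() pass triggered from
 * the classify path.
 */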
static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct fq_flow *f, *tofree[FQ_GC_MAX];
	struct rb_node **p, *parent;
	int fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
	while (fcnt) {
		struct fq_flow *f = tofree[--fcnt];

		rb_erase(&f->fq_node, root);
		kmem_cache_free(fq_flow_cachep, f);
	}
}

static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;

	/* warning: no starvation prevention... */
	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
		return &q->internal;

	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
	 * or a listener (SYNCOOKIE mode)
	 * 1) request sockets are not full blown,
	 *    they do not contain sk_pacing_rate
	 * 2) They are not part of a 'flow' yet
	 * 3) We do not want to rate limit them (e.g. SYNFLOOD attack),
	 *    especially if the listener set SO_MAX_PACING_RATE
	 * 4) We pretend they are orphaned
	 */
	if (!sk || sk_listener(sk)) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	} else if (sk->sk_state == TCP_CLOSE) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
		/*
		 * Sockets in TCP_CLOSE are not connected.
		 * Typical use case is UDP sockets, they can send packets
		 * with sendto() to many different destinations.
		 * We probably could use a generic bit advertising
		 * non connected sockets, instead of sk_state == TCP_CLOSE,
		 * if we care enough.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];

	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = rb_entry(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
				if (q->rate_enable)
					smp_store_release(&sk->sk_pacing_status,
							  SK_PACING_FQ);
				if (fq_flow_is_throttled(f))
					fq_flow_unset_throttled(q, f);
				f->time_next_packet = 0ULL;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	/* f->t_root is already zeroed after kmem_cache_zalloc() */

	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk == sk) {
		f->socket_hash = sk->sk_hash;
		if (q->rate_enable)
			smp_store_release(&sk->sk_pacing_status,
					  SK_PACING_FQ);
	}
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}
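/* Illustrative user-space counterpart of the SK_PACING_FQ handshake above
 * (a sketch, not kernel code): an application owning socket 'fd' can cap
 * its pacing rate, which fq then enforces through sk->sk_pacing_rate when
 * rate_enable is set. The value is in bytes per second (~10 Mbit/s here):
 *
 *	unsigned int rate = 1250000;
 *	setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, &rate, sizeof(rate));
 */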
static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *skb = skb_rb_first(&flow->t_root);
	struct sk_buff *head = flow->head;

	if (!skb)
		return head;

	if (!head)
		return skb;

	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
		return skb;
	return head;
}

static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb == flow->head) {
		flow->head = skb->next;
	} else {
		rb_erase(&skb->rbnode, &flow->t_root);
		skb->dev = qdisc_dev(sch);
	}
}

/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
	struct sk_buff *skb = fq_peek(flow);

	if (skb) {
		fq_erase_head(sch, flow, skb);
		skb_mark_not_on_list(skb);
		flow->qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		sch->q.qlen--;
	}
	return skb;
}

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct rb_node **p, *parent;
	struct sk_buff *head, *aux;

	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();

	head = flow->head;
	if (!head ||
	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
		if (!head)
			flow->head = skb;
		else
			flow->tail->next = skb;
		flow->tail = skb;
		skb->next = NULL;
		return;
	}

	p = &flow->t_root.rb_node;
	parent = NULL;

	while (*p) {
		parent = *p;
		aux = rb_to_skb(parent);
		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, &flow->t_root);
}
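/* flow_queue_add() honors an Earliest Departure Time carried in skb->tstamp.
 * Besides TCP's internal EDT model, user space can supply such timestamps
 * via SO_TXTIME/SCM_TXTIME; an illustrative sketch, assuming
 * <linux/net_tstamp.h>, CLOCK_MONOTONIC (this qdisc's clock) and socket 'fd':
 *
 *	struct sock_txtime cfg = { .clockid = CLOCK_MONOTONIC, .flags = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
 *
 * with each sendmsg() then carrying an SCM_TXTIME cmsg holding the desired
 * departure time in nanoseconds.
 */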
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		      struct sk_buff **to_free)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch, to_free);

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch, to_free);
	}

	f->qlen++;
	qdisc_qstats_backlog_inc(sch, skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (time_after(jiffies, f->age + q->flow_refill_delay))
			f->credit = max_t(u32, f->credit, q->quantum);
		q->inactive_flows--;
	}

	/* Note: this overwrites f->age */
	flow_queue_add(f, skb);

	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	unsigned long sample;
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	/* Update unthrottle latency EWMA.
	 * This is cheap and can help diagnosing timer/latency problems.
	 */
	sample = (unsigned long)(now - q->time_next_delayed_flow);
	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
	q->unthrottle_latency_ns += sample >> 3;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		fq_flow_unset_throttled(q, f);
	}
}
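/* The EWMA above uses a 1/8 gain: for example, a previous estimate of 80 us
 * combined with a new unthrottle-latency sample of 160 us becomes
 * 80000 - (80000 >> 3) + (160000 >> 3) = 90000 ns, i.e. 90 us.
 */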
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;
	unsigned long rate;
	u32 plen;
	u64 now;

	if (!sch->q.qlen)
		return NULL;

	skb = fq_dequeue_head(sch, &q->internal);
	if (skb)
		goto out;

	now = ktime_get_ns();
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_range_ns(&q->watchdog,
							q->time_next_delayed_flow,
							q->timer_slack);
			return NULL;
		}
	}
	f = head->first;

	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	skb = fq_peek(f);
	if (skb) {
		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
			head->first = f->next;
			f->time_next_packet = time_next_packet;
			fq_flow_set_throttled(q, f);
			goto begin;
		}
		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
			INET_ECN_set_ce(skb);
			q->stat_ce_mark++;
		}
	}

	skb = fq_dequeue_head(sch, f);
	if (!skb) {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			q->inactive_flows++;
		}
		goto begin;
	}
	prefetch(&skb->end);
	plen = qdisc_pkt_len(skb);
	f->credit -= plen;

	if (!q->rate_enable)
		goto out;

	rate = q->flow_max_rate;

	/* If EDT time was provided for this skb, we need to
	 * update f->time_next_packet only if this qdisc enforces
	 * a flow max rate.
	 */
	if (!skb->tstamp) {
		if (skb->sk)
			rate = min(skb->sk->sk_pacing_rate, rate);

		if (rate <= q->low_rate_threshold) {
			f->credit = 0;
		} else {
			plen = max(plen, q->quantum);
			if (f->credit > 0)
				goto out;
		}
	}
	if (rate != ~0UL) {
		u64 len = (u64)plen * NSEC_PER_SEC;

		if (likely(rate))
			len = div64_ul(len, rate);
		/* Since socket rate can change later,
		 * clamp the delay to 1 second.
		 * Really, providers of too big packets should be fixed !
		 */
		if (unlikely(len > NSEC_PER_SEC)) {
			len = NSEC_PER_SEC;
			q->stat_pkts_too_long++;
		}
		/* Account for schedule/timers drifts.
		 * f->time_next_packet was set when prior packet was sent,
		 * and current time (@now) can be too late by tens of us.
		 */
		if (f->time_next_packet)
			len -= min(len/2, now - f->time_next_packet);
		f->time_next_packet = now + len;
	}
out:
	qdisc_bstats_update(sch, skb);
	return skb;
}
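/* Worked example of the pacing delay computed above: a 1514 byte packet on a
 * flow paced at 12,500,000 bytes/sec (100 Mbit/s) gives
 * len = 1514 * NSEC_PER_SEC / 12500000 = 121120 ns, i.e. ~121 us between
 * departures for that flow, before the drift correction is applied.
 */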
static void fq_flow_purge(struct fq_flow *flow)
{
	struct rb_node *p = rb_first(&flow->t_root);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		rb_erase(&skb->rbnode, &flow->t_root);
		rtnl_kfree_skbs(skb, skb);
	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;
}

static void fq_reset(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	struct fq_flow *f;
	unsigned int idx;

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;

	fq_flow_purge(&q->internal);

	if (!q->fq_root)
		return;

	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
		root = &q->fq_root[idx];
		while ((p = rb_first(root)) != NULL) {
			f = rb_entry(p, struct fq_flow, fq_node);
			rb_erase(p, root);

			fq_flow_purge(f);

			kmem_cache_free(fq_flow_cachep, f);
		}
	}
	q->new_flows.first = NULL;
	q->old_flows.first = NULL;
	q->delayed	   = RB_ROOT;
	q->flows	   = 0;
	q->inactive_flows  = 0;
	q->throttled_flows = 0;
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = rb_entry(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_ptr(of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = rb_entry(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static void fq_free(void *addr)
{
	kvfree(addr);
}

static int fq_resize(struct Qdisc *sch, u32 log)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *array;
	void *old_fq_root;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	/* If XPS was setup, we can allocate memory on right NUMA node */
	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
			      netdev_queue_numa_node_read(sch->dev_queue));
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	sch_tree_lock(sch);

	old_fq_root = q->fq_root;
	if (old_fq_root)
		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);

	q->fq_root = array;
	q->fq_trees_log = log;

	sch_tree_unlock(sch);

	fq_free(old_fq_root);

	return 0;
}
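/* Sizing note for the allocation above: each bucket is a bare struct rb_root
 * (one pointer), so the default fq_trees_log of 10 gives 1024 buckets, i.e.
 * 8 KB on a 64-bit build. TCA_FQ_BUCKETS_LOG (the 'buckets' option in tc)
 * can raise this up to 2^18 buckets, as validated in fq_change() below.
 */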
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },

	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
};

static int fq_change(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	unsigned drop_len = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM]) {
		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

		if (quantum > 0 && quantum <= (1 << 20)) {
			q->quantum = quantum;
		} else {
			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
			err = -EINVAL;
		}
	}

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));

	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
	}
	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
		q->low_rate_threshold =
			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);

		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
	}

	if (tb[TCA_FQ_ORPHAN_MASK])
		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);

	if (tb[TCA_FQ_CE_THRESHOLD])
		q->ce_threshold = (u64)NSEC_PER_USEC *
				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);

	if (tb[TCA_FQ_TIMER_SLACK])
		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);

	if (!err) {
		sch_tree_unlock(sch);
		err = fq_resize(sch, fq_log);
		sch_tree_lock(sch);
	}
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		if (!skb)
			break;
		drop_len += qdisc_pkt_len(skb);
		rtnl_kfree_skbs(skb, skb);
		drop_count++;
	}
	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	fq_reset(sch);
	fq_free(q->fq_root);
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt,
		   struct netlink_ext_ack *extack)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit		= 10000;
	q->flow_plimit		= 100;
	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
	q->flow_refill_delay	= msecs_to_jiffies(40);
	q->flow_max_rate	= ~0UL;
	q->time_next_delayed_flow = ~0ULL;
	q->rate_enable		= 1;
	q->new_flows.first	= NULL;
	q->old_flows.first	= NULL;
	q->delayed		= RB_ROOT;
	q->fq_root		= NULL;
	q->fq_trees_log		= ilog2(1024);
	q->orphan_mask		= 1024 - 1;
	q->low_rate_threshold	= 550000 / 8;

	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */

	/* Default ce_threshold of 4294 seconds */
	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;

	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);

	if (opt)
		err = fq_change(sch, opt, extack);
	else
		err = fq_resize(sch, q->fq_trees_log);

	return err;
}
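/* The dump path below exports the configuration and a struct tc_fq_qd_stats
 * snapshot; from user space these are typically read with the iproute2 front
 * end, e.g. 'tc -s qdisc show dev eth0' (device name illustrative).
 */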
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 ce_threshold = q->ce_threshold;
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */

	do_div(ce_threshold, NSEC_PER_USEC);

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
			jiffies_to_usecs(q->flow_refill_delay)) ||
	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
			q->low_rate_threshold) ||
	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct tc_fq_qd_stats st;

	sch_tree_lock(sch);

	st.gc_flows		  = q->stat_gc_flows;
	st.highprio_packets	  = q->stat_internal_packets;
	st.tcp_retrans		  = 0;
	st.throttled		  = q->stat_throttled;
	st.flows_plimit		  = q->stat_flows_plimit;
	st.pkts_too_long	  = q->stat_pkts_too_long;
	st.allocation_errors	  = q->stat_allocation_errors;
	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
				    ktime_get_ns();
	st.flows		  = q->flows;
	st.inactive_flows	  = q->inactive_flows;
	st.throttled_flows	  = q->throttled_flows;
	st.unthrottle_latency_ns  = min_t(unsigned long,
					  q->unthrottle_latency_ns, ~0U);
	st.ce_mark		  = q->stat_ce_mark;
	sch_tree_unlock(sch);

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		=	"fq",
	.priv_size	=	sizeof(struct fq_sched_data),

	.enqueue	=	fq_enqueue,
	.dequeue	=	fq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	fq_init,
	.reset		=	fq_reset,
	.destroy	=	fq_destroy,
	.change		=	fq_change,
	.dump		=	fq_dump,
	.dump_stats	=	fq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");