/*
 * Fair Queue CoDel discipline
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/codel.h>

/* Fair Queue CoDel.
 *
 * Principles:
 * Packets are classified (by an internal or external classifier) into flows.
 * This is a stochastic model (as we use a hash, several flows
 * might be hashed to the same slot).
 * Each flow has a CoDel-managed queue.
 * Flows are linked onto two (round robin) lists,
 * so that new flows have priority over old ones.
 *
 * For a given flow, packets are not reordered (CoDel uses a FIFO);
 * only head drops are performed.
 * ECN capability is on by default.
 * Low memory footprint (64 bytes per flow).
 */

struct fq_codel_flow {
        struct sk_buff    *head;
        struct sk_buff    *tail;
        struct list_head  flowchain;
        int               deficit;
        u32               dropped; /* number of drops (or ECN marks) on this flow */
        struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */

struct fq_codel_sched_data {
        struct tcf_proto *filter_list;  /* optional external classifier */
        struct fq_codel_flow *flows;    /* Flows table [flows_cnt] */
        u32             *backlogs;      /* backlog table [flows_cnt] */
        u32             flows_cnt;      /* number of flows */
        u32             perturbation;   /* hash perturbation */
        u32             quantum;        /* psched_mtu(qdisc_dev(sch)); */
        struct codel_params cparams;
        struct codel_stats cstats;
        u32             drop_overlimit;
        u32             new_flow_count;

        struct list_head new_flows;     /* list of new flows */
        struct list_head old_flows;     /* list of old flows */
};

static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
                                  const struct sk_buff *skb)
{
        struct flow_keys keys;
        unsigned int hash;

        skb_flow_dissect(skb, &keys);
        hash = jhash_3words((__force u32)keys.dst,
                            (__force u32)keys.src ^ keys.ip_proto,
                            (__force u32)keys.ports, q->perturbation);
        return ((u64)hash * q->flows_cnt) >> 32;
}

static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
                                      int *qerr)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct tcf_result res;
        int result;

        if (TC_H_MAJ(skb->priority) == sch->handle &&
            TC_H_MIN(skb->priority) > 0 &&
            TC_H_MIN(skb->priority) <= q->flows_cnt)
                return TC_H_MIN(skb->priority);

        if (!q->filter_list)
                return fq_codel_hash(q, skb) + 1;

        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
        result = tc_classify(skb, q->filter_list, &res);
        if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                switch (result) {
                case TC_ACT_STOLEN:
                case TC_ACT_QUEUED:
                        *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
                case TC_ACT_SHOT:
                        return 0;
                }
#endif
                if (TC_H_MIN(res.classid) <= q->flows_cnt)
                        return TC_H_MIN(res.classid);
        }
        return 0;
}
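
/* Classification result convention: 0 means "no flow" and makes
 * fq_codel_enqueue() drop the packet; 1 .. flows_cnt selects a flow,
 * and the enqueue path uses (result - 1) as the index into the
 * flows[] and backlogs[] tables.
 */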

/* Helper functions: might be changed when/if skbs use a standard list_head. */

/* remove one skb from head of slot queue */
static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
{
        struct sk_buff *skb = flow->head;

        flow->head = skb->next;
        skb->next = NULL;
        return skb;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_codel_flow *flow,
                                  struct sk_buff *skb)
{
        if (flow->head == NULL)
                flow->head = skb;
        else
                flow->tail->next = skb;
        flow->tail = skb;
        skb->next = NULL;
}

static unsigned int fq_codel_drop(struct Qdisc *sch)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        unsigned int maxbacklog = 0, idx = 0, i, len;
        struct fq_codel_flow *flow;

        /* Queue is full! Find the fat flow and drop a packet from it.
         * This might sound expensive, but with 1024 flows, we scan
         * 4KB of memory, and we don't need to handle a complex tree
         * in the fast path (packet queue/enqueue) with many cache misses.
         */
        for (i = 0; i < q->flows_cnt; i++) {
                if (q->backlogs[i] > maxbacklog) {
                        maxbacklog = q->backlogs[i];
                        idx = i;
                }
        }
        flow = &q->flows[idx];
        skb = dequeue_head(flow);
        len = qdisc_pkt_len(skb);
        q->backlogs[idx] -= len;
        kfree_skb(skb);
        sch->q.qlen--;
        sch->qstats.drops++;
        sch->qstats.backlog -= len;
        flow->dropped++;
        return idx;
}

static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        unsigned int idx;
        struct fq_codel_flow *flow;
        int uninitialized_var(ret);

        idx = fq_codel_classify(skb, sch, &ret);
        if (idx == 0) {
                if (ret & __NET_XMIT_BYPASS)
                        sch->qstats.drops++;
                kfree_skb(skb);
                return ret;
        }
        idx--;

        codel_set_enqueue_time(skb);
        flow = &q->flows[idx];
        flow_queue_add(flow, skb);
        q->backlogs[idx] += qdisc_pkt_len(skb);
        sch->qstats.backlog += qdisc_pkt_len(skb);

        if (list_empty(&flow->flowchain)) {
                list_add_tail(&flow->flowchain, &q->new_flows);
                codel_vars_init(&flow->cvars);
                q->new_flow_count++;
                flow->deficit = q->quantum;
                flow->dropped = 0;
        }
        if (++sch->q.qlen <= sch->limit)
                return NET_XMIT_SUCCESS;

        q->drop_overlimit++;
        /* Return Congestion Notification only if we dropped a packet
         * from this flow.
         */
        if (fq_codel_drop(sch) == idx)
                return NET_XMIT_CN;

        /* As we dropped a packet, better let the upper stack know about it. */
        qdisc_tree_decrease_qlen(sch, 1);
        return NET_XMIT_SUCCESS;
}

/* This is the specific function called from codel_dequeue()
 * to dequeue a packet from the queue. Note: backlog is handled in
 * codel, we don't need to reduce it here.
 */
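
/* codel_dequeue() passes back the codel_vars pointer it was given.
 * Since cvars is embedded in struct fq_codel_flow, container_of()
 * recovers the owning flow without any extra lookup.
 */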

static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct fq_codel_flow *flow;
        struct sk_buff *skb = NULL;

        flow = container_of(vars, struct fq_codel_flow, cvars);
        if (flow->head) {
                skb = dequeue_head(flow);
                q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
                sch->q.qlen--;
        }
        return skb;
}

static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct sk_buff *skb;
        struct fq_codel_flow *flow;
        struct list_head *head;
        u32 prev_drop_count, prev_ecn_mark;

begin:
        head = &q->new_flows;
        if (list_empty(head)) {
                head = &q->old_flows;
                if (list_empty(head))
                        return NULL;
        }
        flow = list_first_entry(head, struct fq_codel_flow, flowchain);

        if (flow->deficit <= 0) {
                flow->deficit += q->quantum;
                list_move_tail(&flow->flowchain, &q->old_flows);
                goto begin;
        }

        prev_drop_count = q->cstats.drop_count;
        prev_ecn_mark = q->cstats.ecn_mark;

        skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
                            dequeue);

        flow->dropped += q->cstats.drop_count - prev_drop_count;
        flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;

        if (!skb) {
                /* force a pass through old_flows to prevent starvation */
                if ((head == &q->new_flows) && !list_empty(&q->old_flows))
                        list_move_tail(&flow->flowchain, &q->old_flows);
                else
                        list_del_init(&flow->flowchain);
                goto begin;
        }
        qdisc_bstats_update(sch, skb);
        flow->deficit -= qdisc_pkt_len(skb);
        /* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
         * or HTB crashes. Defer it for the next round.
         */
        if (q->cstats.drop_count && sch->q.qlen) {
                qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
                q->cstats.drop_count = 0;
        }
        return skb;
}

static void fq_codel_reset(struct Qdisc *sch)
{
        struct sk_buff *skb;

        while ((skb = fq_codel_dequeue(sch)) != NULL)
                kfree_skb(skb);
}

static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
        [TCA_FQ_CODEL_TARGET]   = { .type = NLA_U32 },
        [TCA_FQ_CODEL_LIMIT]    = { .type = NLA_U32 },
        [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 },
        [TCA_FQ_CODEL_ECN]      = { .type = NLA_U32 },
        [TCA_FQ_CODEL_FLOWS]    = { .type = NLA_U32 },
        [TCA_FQ_CODEL_QUANTUM]  = { .type = NLA_U32 },
};

static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
        int err;

        if (!opt)
                return -EINVAL;

        err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
        if (err < 0)
                return err;
        if (tb[TCA_FQ_CODEL_FLOWS]) {
                if (q->flows)
                        return -EINVAL;
                q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
                if (!q->flows_cnt ||
                    q->flows_cnt > 65536)
                        return -EINVAL;
        }
        sch_tree_lock(sch);

        if (tb[TCA_FQ_CODEL_TARGET]) {
                u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);

                q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
        }

        if (tb[TCA_FQ_CODEL_INTERVAL]) {
                u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);

                q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
        }

        if (tb[TCA_FQ_CODEL_LIMIT])
                sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);

        if (tb[TCA_FQ_CODEL_ECN])
                q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);

        if (tb[TCA_FQ_CODEL_QUANTUM])
                q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));

        while (sch->q.qlen > sch->limit) {
                struct sk_buff *skb = fq_codel_dequeue(sch);

                kfree_skb(skb);
                q->cstats.drop_count++;
        }
        qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
        q->cstats.drop_count = 0;

        sch_tree_unlock(sch);
        return 0;
}

static void *fq_codel_zalloc(size_t sz)
{
        void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);

        if (!ptr)
                ptr = vzalloc(sz);
        return ptr;
}

static void fq_codel_free(void *addr)
{
        if (addr) {
                if (is_vmalloc_addr(addr))
                        vfree(addr);
                else
                        kfree(addr);
        }
}

static void fq_codel_destroy(struct Qdisc *sch)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);

        tcf_destroy_chain(&q->filter_list);
        fq_codel_free(q->backlogs);
        fq_codel_free(q->flows);
}
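
/* Defaults chosen by fq_codel_init(): a 10240 packet limit, 1024 flow
 * buckets, quantum equal to the device MTU and ECN marking enabled.
 * All of them can be overridden via the netlink attributes handled in
 * fq_codel_change().
 */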

static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        int i;

        sch->limit = 10*1024;
        q->flows_cnt = 1024;
        q->quantum = psched_mtu(qdisc_dev(sch));
        q->perturbation = net_random();
        INIT_LIST_HEAD(&q->new_flows);
        INIT_LIST_HEAD(&q->old_flows);
        codel_params_init(&q->cparams);
        codel_stats_init(&q->cstats);
        q->cparams.ecn = true;

        if (opt) {
                int err = fq_codel_change(sch, opt);
                if (err)
                        return err;
        }

        if (!q->flows) {
                q->flows = fq_codel_zalloc(q->flows_cnt *
                                           sizeof(struct fq_codel_flow));
                if (!q->flows)
                        return -ENOMEM;
                q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
                if (!q->backlogs) {
                        fq_codel_free(q->flows);
                        return -ENOMEM;
                }
                for (i = 0; i < q->flows_cnt; i++) {
                        struct fq_codel_flow *flow = q->flows + i;

                        INIT_LIST_HEAD(&flow->flowchain);
                }
        }
        if (sch->limit >= 1)
                sch->flags |= TCQ_F_CAN_BYPASS;
        else
                sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct nlattr *opts;

        opts = nla_nest_start(skb, TCA_OPTIONS);
        if (opts == NULL)
                goto nla_put_failure;

        if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
                        codel_time_to_us(q->cparams.target)) ||
            nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
                        sch->limit) ||
            nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
                        codel_time_to_us(q->cparams.interval)) ||
            nla_put_u32(skb, TCA_FQ_CODEL_ECN,
                        q->cparams.ecn) ||
            nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
                        q->quantum) ||
            nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
                        q->flows_cnt))
                goto nla_put_failure;

        nla_nest_end(skb, opts);
        return skb->len;

nla_put_failure:
        return -1;
}

static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        struct tc_fq_codel_xstats st = {
                .type = TCA_FQ_CODEL_XSTATS_QDISC,
        };
        struct list_head *pos;

        st.qdisc_stats.maxpacket = q->cstats.maxpacket;
        st.qdisc_stats.drop_overlimit = q->drop_overlimit;
        st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
        st.qdisc_stats.new_flow_count = q->new_flow_count;

        list_for_each(pos, &q->new_flows)
                st.qdisc_stats.new_flows_len++;

        list_for_each(pos, &q->old_flows)
                st.qdisc_stats.old_flows_len++;

        return gnet_stats_copy_app(d, &st, sizeof(st));
}
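
/* From a configuration point of view fq_codel is classless, but each
 * flow is exposed as a pseudo-class numbered 1..flows_cnt so that
 * per-flow statistics can be dumped and filters can be attached; the
 * class operations below are therefore mostly stubs.
 */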

static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
{
        return NULL;
}

static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
{
        return 0;
}

static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
                                   u32 classid)
{
        /* we cannot bypass queue discipline anymore */
        sch->flags &= ~TCQ_F_CAN_BYPASS;
        return 0;
}

static void fq_codel_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);

        if (cl)
                return NULL;
        return &q->filter_list;
}

static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
                               struct sk_buff *skb, struct tcmsg *tcm)
{
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                     struct gnet_dump *d)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        u32 idx = cl - 1;
        struct gnet_stats_queue qs = { 0 };
        struct tc_fq_codel_xstats xstats;

        if (idx < q->flows_cnt) {
                const struct fq_codel_flow *flow = &q->flows[idx];
                const struct sk_buff *skb = flow->head;

                memset(&xstats, 0, sizeof(xstats));
                xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
                xstats.class_stats.deficit = flow->deficit;
                xstats.class_stats.ldelay =
                        codel_time_to_us(flow->cvars.ldelay);
                xstats.class_stats.count = flow->cvars.count;
                xstats.class_stats.lastcount = flow->cvars.lastcount;
                xstats.class_stats.dropping = flow->cvars.dropping;
                if (flow->cvars.dropping) {
                        codel_tdiff_t delta = flow->cvars.drop_next -
                                              codel_get_time();

                        xstats.class_stats.drop_next = (delta >= 0) ?
                                codel_time_to_us(delta) :
                                -codel_time_to_us(-delta);
                }
                while (skb) {
                        qs.qlen++;
                        skb = skb->next;
                }
                qs.backlog = q->backlogs[idx];
                qs.drops = flow->dropped;
        }
        if (gnet_stats_copy_queue(d, &qs) < 0)
                return -1;
        if (idx < q->flows_cnt)
                return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
        return 0;
}

static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct fq_codel_sched_data *q = qdisc_priv(sch);
        unsigned int i;

        if (arg->stop)
                return;

        for (i = 0; i < q->flows_cnt; i++) {
                if (list_empty(&q->flows[i].flowchain) ||
                    arg->count < arg->skip) {
                        arg->count++;
                        continue;
                }
                if (arg->fn(sch, i + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops fq_codel_class_ops = {
        .leaf           = fq_codel_leaf,
        .get            = fq_codel_get,
        .put            = fq_codel_put,
        .tcf_chain      = fq_codel_find_tcf,
        .bind_tcf       = fq_codel_bind,
        .unbind_tcf     = fq_codel_put,
        .dump           = fq_codel_dump_class,
        .dump_stats     = fq_codel_dump_class_stats,
        .walk           = fq_codel_walk,
};

static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
        .cl_ops         = &fq_codel_class_ops,
        .id             = "fq_codel",
        .priv_size      = sizeof(struct fq_codel_sched_data),
        .enqueue        = fq_codel_enqueue,
        .dequeue        = fq_codel_dequeue,
        .peek           = qdisc_peek_dequeued,
        .drop           = fq_codel_drop,
        .init           = fq_codel_init,
        .reset          = fq_codel_reset,
        .destroy        = fq_codel_destroy,
        .change         = fq_codel_change,
        .dump           = fq_codel_dump,
        .dump_stats     = fq_codel_dump_stats,
        .owner          = THIS_MODULE,
};

static int __init fq_codel_module_init(void)
{
        return register_qdisc(&fq_codel_qdisc_ops);
}

static void __exit fq_codel_module_exit(void)
{
        unregister_qdisc(&fq_codel_qdisc_ops);
}

module_init(fq_codel_module_init)
module_exit(fq_codel_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");
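
/* Example setup from user space (illustrative only: the device name and
 * parameter values are arbitrary, and the exact syntax depends on an
 * iproute2 version with fq_codel support):
 *
 *   tc qdisc add dev eth0 root fq_codel limit 10240 flows 1024 \
 *           target 5ms interval 100ms quantum 1514 ecn
 *   tc -s qdisc show dev eth0
 *   tc -s class show dev eth0
 */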