14b549a2eSEric Dumazet /* 24b549a2eSEric Dumazet * Fair Queue CoDel discipline 34b549a2eSEric Dumazet * 44b549a2eSEric Dumazet * This program is free software; you can redistribute it and/or 54b549a2eSEric Dumazet * modify it under the terms of the GNU General Public License 64b549a2eSEric Dumazet * as published by the Free Software Foundation; either version 74b549a2eSEric Dumazet * 2 of the License, or (at your option) any later version. 84b549a2eSEric Dumazet * 980ba92faSEric Dumazet * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> 104b549a2eSEric Dumazet */ 114b549a2eSEric Dumazet 124b549a2eSEric Dumazet #include <linux/module.h> 134b549a2eSEric Dumazet #include <linux/types.h> 144b549a2eSEric Dumazet #include <linux/kernel.h> 154b549a2eSEric Dumazet #include <linux/jiffies.h> 164b549a2eSEric Dumazet #include <linux/string.h> 174b549a2eSEric Dumazet #include <linux/in.h> 184b549a2eSEric Dumazet #include <linux/errno.h> 194b549a2eSEric Dumazet #include <linux/init.h> 204b549a2eSEric Dumazet #include <linux/skbuff.h> 214b549a2eSEric Dumazet #include <linux/jhash.h> 224b549a2eSEric Dumazet #include <linux/slab.h> 234b549a2eSEric Dumazet #include <linux/vmalloc.h> 244b549a2eSEric Dumazet #include <net/netlink.h> 254b549a2eSEric Dumazet #include <net/pkt_sched.h> 264b549a2eSEric Dumazet #include <net/codel.h> 274b549a2eSEric Dumazet 284b549a2eSEric Dumazet /* Fair Queue CoDel. 294b549a2eSEric Dumazet * 304b549a2eSEric Dumazet * Principles : 314b549a2eSEric Dumazet * Packets are classified (internal classifier or external) on flows. 324b549a2eSEric Dumazet * This is a Stochastic model (as we use a hash, several flows 334b549a2eSEric Dumazet * might be hashed on same slot) 344b549a2eSEric Dumazet * Each flow has a CoDel managed queue. 354b549a2eSEric Dumazet * Flows are linked onto two (Round Robin) lists, 364b549a2eSEric Dumazet * so that new flows have priority on old ones. 374b549a2eSEric Dumazet * 384b549a2eSEric Dumazet * For a given flow, packets are not reordered (CoDel uses a FIFO) 394b549a2eSEric Dumazet * head drops only. 404b549a2eSEric Dumazet * ECN capability is on by default. 414b549a2eSEric Dumazet * Low memory footprint (64 bytes per flow) 424b549a2eSEric Dumazet */ 434b549a2eSEric Dumazet 444b549a2eSEric Dumazet struct fq_codel_flow { 454b549a2eSEric Dumazet struct sk_buff *head; 464b549a2eSEric Dumazet struct sk_buff *tail; 474b549a2eSEric Dumazet struct list_head flowchain; 484b549a2eSEric Dumazet int deficit; 494b549a2eSEric Dumazet u32 dropped; /* number of drops (or ECN marks) on this flow */ 504b549a2eSEric Dumazet struct codel_vars cvars; 514b549a2eSEric Dumazet }; /* please try to keep this structure <= 64 bytes */ 524b549a2eSEric Dumazet 534b549a2eSEric Dumazet struct fq_codel_sched_data { 5425d8c0d5SJohn Fastabend struct tcf_proto __rcu *filter_list; /* optional external classifier */ 554b549a2eSEric Dumazet struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ 564b549a2eSEric Dumazet u32 *backlogs; /* backlog table [flows_cnt] */ 574b549a2eSEric Dumazet u32 flows_cnt; /* number of flows */ 584b549a2eSEric Dumazet u32 perturbation; /* hash perturbation */ 594b549a2eSEric Dumazet u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ 604b549a2eSEric Dumazet struct codel_params cparams; 614b549a2eSEric Dumazet struct codel_stats cstats; 624b549a2eSEric Dumazet u32 drop_overlimit; 634b549a2eSEric Dumazet u32 new_flow_count; 644b549a2eSEric Dumazet 654b549a2eSEric Dumazet struct list_head new_flows; /* list of new flows */ 664b549a2eSEric Dumazet struct list_head old_flows; /* list of old flows */ 674b549a2eSEric Dumazet }; 684b549a2eSEric Dumazet 694b549a2eSEric Dumazet static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, 70342db221STom Herbert struct sk_buff *skb) 714b549a2eSEric Dumazet { 72342db221STom Herbert u32 hash = skb_get_hash_perturb(skb, q->perturbation); 738fc54f68SDaniel Borkmann 748fc54f68SDaniel Borkmann return reciprocal_scale(hash, q->flows_cnt); 754b549a2eSEric Dumazet } 764b549a2eSEric Dumazet 774b549a2eSEric Dumazet static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, 784b549a2eSEric Dumazet int *qerr) 794b549a2eSEric Dumazet { 804b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 8125d8c0d5SJohn Fastabend struct tcf_proto *filter; 824b549a2eSEric Dumazet struct tcf_result res; 834b549a2eSEric Dumazet int result; 844b549a2eSEric Dumazet 854b549a2eSEric Dumazet if (TC_H_MAJ(skb->priority) == sch->handle && 864b549a2eSEric Dumazet TC_H_MIN(skb->priority) > 0 && 874b549a2eSEric Dumazet TC_H_MIN(skb->priority) <= q->flows_cnt) 884b549a2eSEric Dumazet return TC_H_MIN(skb->priority); 894b549a2eSEric Dumazet 9069204cf7SValdis.Kletnieks@vt.edu filter = rcu_dereference_bh(q->filter_list); 9125d8c0d5SJohn Fastabend if (!filter) 924b549a2eSEric Dumazet return fq_codel_hash(q, skb) + 1; 934b549a2eSEric Dumazet 944b549a2eSEric Dumazet *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 953b3ae880SDaniel Borkmann result = tc_classify(skb, filter, &res, false); 964b549a2eSEric Dumazet if (result >= 0) { 974b549a2eSEric Dumazet #ifdef CONFIG_NET_CLS_ACT 984b549a2eSEric Dumazet switch (result) { 994b549a2eSEric Dumazet case TC_ACT_STOLEN: 1004b549a2eSEric Dumazet case TC_ACT_QUEUED: 1014b549a2eSEric Dumazet *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1024b549a2eSEric Dumazet case TC_ACT_SHOT: 1034b549a2eSEric Dumazet return 0; 1044b549a2eSEric Dumazet } 1054b549a2eSEric Dumazet #endif 1064b549a2eSEric Dumazet if (TC_H_MIN(res.classid) <= q->flows_cnt) 1074b549a2eSEric Dumazet return TC_H_MIN(res.classid); 1084b549a2eSEric Dumazet } 1094b549a2eSEric Dumazet return 0; 1104b549a2eSEric Dumazet } 1114b549a2eSEric Dumazet 1124b549a2eSEric Dumazet /* helper functions : might be changed when/if skb use a standard list_head */ 1134b549a2eSEric Dumazet 1144b549a2eSEric Dumazet /* remove one skb from head of slot queue */ 1154b549a2eSEric Dumazet static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) 1164b549a2eSEric Dumazet { 1174b549a2eSEric Dumazet struct sk_buff *skb = flow->head; 1184b549a2eSEric Dumazet 1194b549a2eSEric Dumazet flow->head = skb->next; 1204b549a2eSEric Dumazet skb->next = NULL; 1214b549a2eSEric Dumazet return skb; 1224b549a2eSEric Dumazet } 1234b549a2eSEric Dumazet 1244b549a2eSEric Dumazet /* add skb to flow queue (tail add) */ 1254b549a2eSEric Dumazet static inline void flow_queue_add(struct fq_codel_flow *flow, 1264b549a2eSEric Dumazet struct sk_buff *skb) 1274b549a2eSEric Dumazet { 1284b549a2eSEric Dumazet if (flow->head == NULL) 1294b549a2eSEric Dumazet flow->head = skb; 1304b549a2eSEric Dumazet else 1314b549a2eSEric Dumazet flow->tail->next = skb; 1324b549a2eSEric Dumazet flow->tail = skb; 1334b549a2eSEric Dumazet skb->next = NULL; 1344b549a2eSEric Dumazet } 1354b549a2eSEric Dumazet 1364b549a2eSEric Dumazet static unsigned int fq_codel_drop(struct Qdisc *sch) 1374b549a2eSEric Dumazet { 1384b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 1394b549a2eSEric Dumazet struct sk_buff *skb; 1404b549a2eSEric Dumazet unsigned int maxbacklog = 0, idx = 0, i, len; 1414b549a2eSEric Dumazet struct fq_codel_flow *flow; 1424b549a2eSEric Dumazet 1434b549a2eSEric Dumazet /* Queue is full! Find the fat flow and drop packet from it. 1444b549a2eSEric Dumazet * This might sound expensive, but with 1024 flows, we scan 1454b549a2eSEric Dumazet * 4KB of memory, and we dont need to handle a complex tree 1464b549a2eSEric Dumazet * in fast path (packet queue/enqueue) with many cache misses. 1474b549a2eSEric Dumazet */ 1484b549a2eSEric Dumazet for (i = 0; i < q->flows_cnt; i++) { 1494b549a2eSEric Dumazet if (q->backlogs[i] > maxbacklog) { 1504b549a2eSEric Dumazet maxbacklog = q->backlogs[i]; 1514b549a2eSEric Dumazet idx = i; 1524b549a2eSEric Dumazet } 1534b549a2eSEric Dumazet } 1544b549a2eSEric Dumazet flow = &q->flows[idx]; 1554b549a2eSEric Dumazet skb = dequeue_head(flow); 1564b549a2eSEric Dumazet len = qdisc_pkt_len(skb); 1574b549a2eSEric Dumazet q->backlogs[idx] -= len; 1584b549a2eSEric Dumazet sch->q.qlen--; 15925331d6cSJohn Fastabend qdisc_qstats_drop(sch); 16025331d6cSJohn Fastabend qdisc_qstats_backlog_dec(sch, skb); 161052cbda4SWANG Cong kfree_skb(skb); 1624b549a2eSEric Dumazet flow->dropped++; 1634b549a2eSEric Dumazet return idx; 1644b549a2eSEric Dumazet } 1654b549a2eSEric Dumazet 166c0afd9ceSWANG Cong static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) 167c0afd9ceSWANG Cong { 168c0afd9ceSWANG Cong unsigned int prev_backlog; 169c0afd9ceSWANG Cong 170c0afd9ceSWANG Cong prev_backlog = sch->qstats.backlog; 171c0afd9ceSWANG Cong fq_codel_drop(sch); 172c0afd9ceSWANG Cong return prev_backlog - sch->qstats.backlog; 173c0afd9ceSWANG Cong } 174c0afd9ceSWANG Cong 1754b549a2eSEric Dumazet static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) 1764b549a2eSEric Dumazet { 1774b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 1784b549a2eSEric Dumazet unsigned int idx; 1794b549a2eSEric Dumazet struct fq_codel_flow *flow; 1804b549a2eSEric Dumazet int uninitialized_var(ret); 1814b549a2eSEric Dumazet 1824b549a2eSEric Dumazet idx = fq_codel_classify(skb, sch, &ret); 1834b549a2eSEric Dumazet if (idx == 0) { 1844b549a2eSEric Dumazet if (ret & __NET_XMIT_BYPASS) 18525331d6cSJohn Fastabend qdisc_qstats_drop(sch); 1864b549a2eSEric Dumazet kfree_skb(skb); 1874b549a2eSEric Dumazet return ret; 1884b549a2eSEric Dumazet } 1894b549a2eSEric Dumazet idx--; 1904b549a2eSEric Dumazet 1914b549a2eSEric Dumazet codel_set_enqueue_time(skb); 1924b549a2eSEric Dumazet flow = &q->flows[idx]; 1934b549a2eSEric Dumazet flow_queue_add(flow, skb); 1944b549a2eSEric Dumazet q->backlogs[idx] += qdisc_pkt_len(skb); 19525331d6cSJohn Fastabend qdisc_qstats_backlog_inc(sch, skb); 1964b549a2eSEric Dumazet 1974b549a2eSEric Dumazet if (list_empty(&flow->flowchain)) { 1984b549a2eSEric Dumazet list_add_tail(&flow->flowchain, &q->new_flows); 1994b549a2eSEric Dumazet q->new_flow_count++; 2004b549a2eSEric Dumazet flow->deficit = q->quantum; 2014b549a2eSEric Dumazet flow->dropped = 0; 2024b549a2eSEric Dumazet } 203cd68ddd4SVijay Subramanian if (++sch->q.qlen <= sch->limit) 2044b549a2eSEric Dumazet return NET_XMIT_SUCCESS; 2054b549a2eSEric Dumazet 2064b549a2eSEric Dumazet q->drop_overlimit++; 2074b549a2eSEric Dumazet /* Return Congestion Notification only if we dropped a packet 2084b549a2eSEric Dumazet * from this flow. 2094b549a2eSEric Dumazet */ 2104b549a2eSEric Dumazet if (fq_codel_drop(sch) == idx) 2114b549a2eSEric Dumazet return NET_XMIT_CN; 2124b549a2eSEric Dumazet 2134b549a2eSEric Dumazet /* As we dropped a packet, better let upper stack know this */ 2144b549a2eSEric Dumazet qdisc_tree_decrease_qlen(sch, 1); 2154b549a2eSEric Dumazet return NET_XMIT_SUCCESS; 2164b549a2eSEric Dumazet } 2174b549a2eSEric Dumazet 2184b549a2eSEric Dumazet /* This is the specific function called from codel_dequeue() 2194b549a2eSEric Dumazet * to dequeue a packet from queue. Note: backlog is handled in 2204b549a2eSEric Dumazet * codel, we dont need to reduce it here. 2214b549a2eSEric Dumazet */ 2224b549a2eSEric Dumazet static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) 2234b549a2eSEric Dumazet { 224865ec552SEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 2254b549a2eSEric Dumazet struct fq_codel_flow *flow; 2264b549a2eSEric Dumazet struct sk_buff *skb = NULL; 2274b549a2eSEric Dumazet 2284b549a2eSEric Dumazet flow = container_of(vars, struct fq_codel_flow, cvars); 2294b549a2eSEric Dumazet if (flow->head) { 2304b549a2eSEric Dumazet skb = dequeue_head(flow); 231865ec552SEric Dumazet q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); 2324b549a2eSEric Dumazet sch->q.qlen--; 2334b549a2eSEric Dumazet } 2344b549a2eSEric Dumazet return skb; 2354b549a2eSEric Dumazet } 2364b549a2eSEric Dumazet 2374b549a2eSEric Dumazet static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) 2384b549a2eSEric Dumazet { 2394b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 2404b549a2eSEric Dumazet struct sk_buff *skb; 2414b549a2eSEric Dumazet struct fq_codel_flow *flow; 2424b549a2eSEric Dumazet struct list_head *head; 2434b549a2eSEric Dumazet u32 prev_drop_count, prev_ecn_mark; 2444b549a2eSEric Dumazet 2454b549a2eSEric Dumazet begin: 2464b549a2eSEric Dumazet head = &q->new_flows; 2474b549a2eSEric Dumazet if (list_empty(head)) { 2484b549a2eSEric Dumazet head = &q->old_flows; 2494b549a2eSEric Dumazet if (list_empty(head)) 2504b549a2eSEric Dumazet return NULL; 2514b549a2eSEric Dumazet } 2524b549a2eSEric Dumazet flow = list_first_entry(head, struct fq_codel_flow, flowchain); 2534b549a2eSEric Dumazet 2544b549a2eSEric Dumazet if (flow->deficit <= 0) { 2554b549a2eSEric Dumazet flow->deficit += q->quantum; 2564b549a2eSEric Dumazet list_move_tail(&flow->flowchain, &q->old_flows); 2574b549a2eSEric Dumazet goto begin; 2584b549a2eSEric Dumazet } 2594b549a2eSEric Dumazet 2604b549a2eSEric Dumazet prev_drop_count = q->cstats.drop_count; 2614b549a2eSEric Dumazet prev_ecn_mark = q->cstats.ecn_mark; 2624b549a2eSEric Dumazet 2634b549a2eSEric Dumazet skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, 264865ec552SEric Dumazet dequeue); 2654b549a2eSEric Dumazet 2664b549a2eSEric Dumazet flow->dropped += q->cstats.drop_count - prev_drop_count; 2674b549a2eSEric Dumazet flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; 2684b549a2eSEric Dumazet 2694b549a2eSEric Dumazet if (!skb) { 2704b549a2eSEric Dumazet /* force a pass through old_flows to prevent starvation */ 2714b549a2eSEric Dumazet if ((head == &q->new_flows) && !list_empty(&q->old_flows)) 2724b549a2eSEric Dumazet list_move_tail(&flow->flowchain, &q->old_flows); 2734b549a2eSEric Dumazet else 2744b549a2eSEric Dumazet list_del_init(&flow->flowchain); 2754b549a2eSEric Dumazet goto begin; 2764b549a2eSEric Dumazet } 2774b549a2eSEric Dumazet qdisc_bstats_update(sch, skb); 2784b549a2eSEric Dumazet flow->deficit -= qdisc_pkt_len(skb); 2794b549a2eSEric Dumazet /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, 2804b549a2eSEric Dumazet * or HTB crashes. Defer it for next round. 2814b549a2eSEric Dumazet */ 2824b549a2eSEric Dumazet if (q->cstats.drop_count && sch->q.qlen) { 2834b549a2eSEric Dumazet qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); 2844b549a2eSEric Dumazet q->cstats.drop_count = 0; 2854b549a2eSEric Dumazet } 2864b549a2eSEric Dumazet return skb; 2874b549a2eSEric Dumazet } 2884b549a2eSEric Dumazet 2894b549a2eSEric Dumazet static void fq_codel_reset(struct Qdisc *sch) 2904b549a2eSEric Dumazet { 2913d0e0af4SEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 2923d0e0af4SEric Dumazet int i; 2934b549a2eSEric Dumazet 2943d0e0af4SEric Dumazet INIT_LIST_HEAD(&q->new_flows); 2953d0e0af4SEric Dumazet INIT_LIST_HEAD(&q->old_flows); 2963d0e0af4SEric Dumazet for (i = 0; i < q->flows_cnt; i++) { 2973d0e0af4SEric Dumazet struct fq_codel_flow *flow = q->flows + i; 2983d0e0af4SEric Dumazet 2993d0e0af4SEric Dumazet while (flow->head) { 3003d0e0af4SEric Dumazet struct sk_buff *skb = dequeue_head(flow); 3013d0e0af4SEric Dumazet 3023d0e0af4SEric Dumazet qdisc_qstats_backlog_dec(sch, skb); 3034b549a2eSEric Dumazet kfree_skb(skb); 3044b549a2eSEric Dumazet } 3054b549a2eSEric Dumazet 3063d0e0af4SEric Dumazet INIT_LIST_HEAD(&flow->flowchain); 3073d0e0af4SEric Dumazet codel_vars_init(&flow->cvars); 3083d0e0af4SEric Dumazet } 3093d0e0af4SEric Dumazet memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); 3103d0e0af4SEric Dumazet sch->q.qlen = 0; 3113d0e0af4SEric Dumazet } 3123d0e0af4SEric Dumazet 3134b549a2eSEric Dumazet static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { 3144b549a2eSEric Dumazet [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, 3154b549a2eSEric Dumazet [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, 3164b549a2eSEric Dumazet [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, 3174b549a2eSEric Dumazet [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, 3184b549a2eSEric Dumazet [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, 3194b549a2eSEric Dumazet [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, 32080ba92faSEric Dumazet [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, 3214b549a2eSEric Dumazet }; 3224b549a2eSEric Dumazet 3234b549a2eSEric Dumazet static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) 3244b549a2eSEric Dumazet { 3254b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 3264b549a2eSEric Dumazet struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; 3274b549a2eSEric Dumazet int err; 3284b549a2eSEric Dumazet 3294b549a2eSEric Dumazet if (!opt) 3304b549a2eSEric Dumazet return -EINVAL; 3314b549a2eSEric Dumazet 3324b549a2eSEric Dumazet err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); 3334b549a2eSEric Dumazet if (err < 0) 3344b549a2eSEric Dumazet return err; 3354b549a2eSEric Dumazet if (tb[TCA_FQ_CODEL_FLOWS]) { 3364b549a2eSEric Dumazet if (q->flows) 3374b549a2eSEric Dumazet return -EINVAL; 3384b549a2eSEric Dumazet q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); 3394b549a2eSEric Dumazet if (!q->flows_cnt || 3404b549a2eSEric Dumazet q->flows_cnt > 65536) 3414b549a2eSEric Dumazet return -EINVAL; 3424b549a2eSEric Dumazet } 3434b549a2eSEric Dumazet sch_tree_lock(sch); 3444b549a2eSEric Dumazet 3454b549a2eSEric Dumazet if (tb[TCA_FQ_CODEL_TARGET]) { 3464b549a2eSEric Dumazet u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); 3474b549a2eSEric Dumazet 3484b549a2eSEric Dumazet q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; 3494b549a2eSEric Dumazet } 3504b549a2eSEric Dumazet 35180ba92faSEric Dumazet if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) { 35280ba92faSEric Dumazet u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]); 35380ba92faSEric Dumazet 35480ba92faSEric Dumazet q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; 35580ba92faSEric Dumazet } 35680ba92faSEric Dumazet 3574b549a2eSEric Dumazet if (tb[TCA_FQ_CODEL_INTERVAL]) { 3584b549a2eSEric Dumazet u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); 3594b549a2eSEric Dumazet 3604b549a2eSEric Dumazet q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; 3614b549a2eSEric Dumazet } 3624b549a2eSEric Dumazet 3634b549a2eSEric Dumazet if (tb[TCA_FQ_CODEL_LIMIT]) 3644b549a2eSEric Dumazet sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); 3654b549a2eSEric Dumazet 3664b549a2eSEric Dumazet if (tb[TCA_FQ_CODEL_ECN]) 3674b549a2eSEric Dumazet q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); 3684b549a2eSEric Dumazet 3694b549a2eSEric Dumazet if (tb[TCA_FQ_CODEL_QUANTUM]) 3704b549a2eSEric Dumazet q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); 3714b549a2eSEric Dumazet 3724b549a2eSEric Dumazet while (sch->q.qlen > sch->limit) { 3734b549a2eSEric Dumazet struct sk_buff *skb = fq_codel_dequeue(sch); 3744b549a2eSEric Dumazet 3754b549a2eSEric Dumazet kfree_skb(skb); 3764b549a2eSEric Dumazet q->cstats.drop_count++; 3774b549a2eSEric Dumazet } 3784b549a2eSEric Dumazet qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); 3794b549a2eSEric Dumazet q->cstats.drop_count = 0; 3804b549a2eSEric Dumazet 3814b549a2eSEric Dumazet sch_tree_unlock(sch); 3824b549a2eSEric Dumazet return 0; 3834b549a2eSEric Dumazet } 3844b549a2eSEric Dumazet 3854b549a2eSEric Dumazet static void *fq_codel_zalloc(size_t sz) 3864b549a2eSEric Dumazet { 3874b549a2eSEric Dumazet void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); 3884b549a2eSEric Dumazet 3894b549a2eSEric Dumazet if (!ptr) 3904b549a2eSEric Dumazet ptr = vzalloc(sz); 3914b549a2eSEric Dumazet return ptr; 3924b549a2eSEric Dumazet } 3934b549a2eSEric Dumazet 3944b549a2eSEric Dumazet static void fq_codel_free(void *addr) 3954b549a2eSEric Dumazet { 3964cb28970SWANG Cong kvfree(addr); 3974b549a2eSEric Dumazet } 3984b549a2eSEric Dumazet 3994b549a2eSEric Dumazet static void fq_codel_destroy(struct Qdisc *sch) 4004b549a2eSEric Dumazet { 4014b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 4024b549a2eSEric Dumazet 4034b549a2eSEric Dumazet tcf_destroy_chain(&q->filter_list); 4044b549a2eSEric Dumazet fq_codel_free(q->backlogs); 4054b549a2eSEric Dumazet fq_codel_free(q->flows); 4064b549a2eSEric Dumazet } 4074b549a2eSEric Dumazet 4084b549a2eSEric Dumazet static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) 4094b549a2eSEric Dumazet { 4104b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 4114b549a2eSEric Dumazet int i; 4124b549a2eSEric Dumazet 4134b549a2eSEric Dumazet sch->limit = 10*1024; 4144b549a2eSEric Dumazet q->flows_cnt = 1024; 4154b549a2eSEric Dumazet q->quantum = psched_mtu(qdisc_dev(sch)); 41663862b5bSAruna-Hewapathirane q->perturbation = prandom_u32(); 4174b549a2eSEric Dumazet INIT_LIST_HEAD(&q->new_flows); 4184b549a2eSEric Dumazet INIT_LIST_HEAD(&q->old_flows); 419a5d28090SEric Dumazet codel_params_init(&q->cparams, sch); 4204b549a2eSEric Dumazet codel_stats_init(&q->cstats); 4214b549a2eSEric Dumazet q->cparams.ecn = true; 4224b549a2eSEric Dumazet 4234b549a2eSEric Dumazet if (opt) { 4244b549a2eSEric Dumazet int err = fq_codel_change(sch, opt); 4254b549a2eSEric Dumazet if (err) 4264b549a2eSEric Dumazet return err; 4274b549a2eSEric Dumazet } 4284b549a2eSEric Dumazet 4294b549a2eSEric Dumazet if (!q->flows) { 4304b549a2eSEric Dumazet q->flows = fq_codel_zalloc(q->flows_cnt * 4314b549a2eSEric Dumazet sizeof(struct fq_codel_flow)); 4324b549a2eSEric Dumazet if (!q->flows) 4334b549a2eSEric Dumazet return -ENOMEM; 4344b549a2eSEric Dumazet q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); 4354b549a2eSEric Dumazet if (!q->backlogs) { 4364b549a2eSEric Dumazet fq_codel_free(q->flows); 4374b549a2eSEric Dumazet return -ENOMEM; 4384b549a2eSEric Dumazet } 4394b549a2eSEric Dumazet for (i = 0; i < q->flows_cnt; i++) { 4404b549a2eSEric Dumazet struct fq_codel_flow *flow = q->flows + i; 4414b549a2eSEric Dumazet 4424b549a2eSEric Dumazet INIT_LIST_HEAD(&flow->flowchain); 443b379135cSEric Dumazet codel_vars_init(&flow->cvars); 4444b549a2eSEric Dumazet } 4454b549a2eSEric Dumazet } 4464b549a2eSEric Dumazet if (sch->limit >= 1) 4474b549a2eSEric Dumazet sch->flags |= TCQ_F_CAN_BYPASS; 4484b549a2eSEric Dumazet else 4494b549a2eSEric Dumazet sch->flags &= ~TCQ_F_CAN_BYPASS; 4504b549a2eSEric Dumazet return 0; 4514b549a2eSEric Dumazet } 4524b549a2eSEric Dumazet 4534b549a2eSEric Dumazet static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) 4544b549a2eSEric Dumazet { 4554b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 4564b549a2eSEric Dumazet struct nlattr *opts; 4574b549a2eSEric Dumazet 4584b549a2eSEric Dumazet opts = nla_nest_start(skb, TCA_OPTIONS); 4594b549a2eSEric Dumazet if (opts == NULL) 4604b549a2eSEric Dumazet goto nla_put_failure; 4614b549a2eSEric Dumazet 4624b549a2eSEric Dumazet if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, 4634b549a2eSEric Dumazet codel_time_to_us(q->cparams.target)) || 4644b549a2eSEric Dumazet nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, 4654b549a2eSEric Dumazet sch->limit) || 4664b549a2eSEric Dumazet nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, 4674b549a2eSEric Dumazet codel_time_to_us(q->cparams.interval)) || 4684b549a2eSEric Dumazet nla_put_u32(skb, TCA_FQ_CODEL_ECN, 4694b549a2eSEric Dumazet q->cparams.ecn) || 4704b549a2eSEric Dumazet nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, 4714b549a2eSEric Dumazet q->quantum) || 4724b549a2eSEric Dumazet nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, 4734b549a2eSEric Dumazet q->flows_cnt)) 4744b549a2eSEric Dumazet goto nla_put_failure; 4754b549a2eSEric Dumazet 47680ba92faSEric Dumazet if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && 47780ba92faSEric Dumazet nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, 47880ba92faSEric Dumazet codel_time_to_us(q->cparams.ce_threshold))) 47980ba92faSEric Dumazet goto nla_put_failure; 48080ba92faSEric Dumazet 481d59b7d80SYang Yingliang return nla_nest_end(skb, opts); 4824b549a2eSEric Dumazet 4834b549a2eSEric Dumazet nla_put_failure: 4844b549a2eSEric Dumazet return -1; 4854b549a2eSEric Dumazet } 4864b549a2eSEric Dumazet 4874b549a2eSEric Dumazet static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 4884b549a2eSEric Dumazet { 4894b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 4904b549a2eSEric Dumazet struct tc_fq_codel_xstats st = { 4914b549a2eSEric Dumazet .type = TCA_FQ_CODEL_XSTATS_QDISC, 4924b549a2eSEric Dumazet }; 4934b549a2eSEric Dumazet struct list_head *pos; 4944b549a2eSEric Dumazet 495669d67bfSSasha Levin st.qdisc_stats.maxpacket = q->cstats.maxpacket; 496669d67bfSSasha Levin st.qdisc_stats.drop_overlimit = q->drop_overlimit; 497669d67bfSSasha Levin st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; 498669d67bfSSasha Levin st.qdisc_stats.new_flow_count = q->new_flow_count; 49980ba92faSEric Dumazet st.qdisc_stats.ce_mark = q->cstats.ce_mark; 500669d67bfSSasha Levin 5014b549a2eSEric Dumazet list_for_each(pos, &q->new_flows) 5024b549a2eSEric Dumazet st.qdisc_stats.new_flows_len++; 5034b549a2eSEric Dumazet 5044b549a2eSEric Dumazet list_for_each(pos, &q->old_flows) 5054b549a2eSEric Dumazet st.qdisc_stats.old_flows_len++; 5064b549a2eSEric Dumazet 5074b549a2eSEric Dumazet return gnet_stats_copy_app(d, &st, sizeof(st)); 5084b549a2eSEric Dumazet } 5094b549a2eSEric Dumazet 5104b549a2eSEric Dumazet static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) 5114b549a2eSEric Dumazet { 5124b549a2eSEric Dumazet return NULL; 5134b549a2eSEric Dumazet } 5144b549a2eSEric Dumazet 5154b549a2eSEric Dumazet static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) 5164b549a2eSEric Dumazet { 5174b549a2eSEric Dumazet return 0; 5184b549a2eSEric Dumazet } 5194b549a2eSEric Dumazet 5204b549a2eSEric Dumazet static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, 5214b549a2eSEric Dumazet u32 classid) 5224b549a2eSEric Dumazet { 5234b549a2eSEric Dumazet /* we cannot bypass queue discipline anymore */ 5244b549a2eSEric Dumazet sch->flags &= ~TCQ_F_CAN_BYPASS; 5254b549a2eSEric Dumazet return 0; 5264b549a2eSEric Dumazet } 5274b549a2eSEric Dumazet 5284b549a2eSEric Dumazet static void fq_codel_put(struct Qdisc *q, unsigned long cl) 5294b549a2eSEric Dumazet { 5304b549a2eSEric Dumazet } 5314b549a2eSEric Dumazet 53225d8c0d5SJohn Fastabend static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch, 53325d8c0d5SJohn Fastabend unsigned long cl) 5344b549a2eSEric Dumazet { 5354b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 5364b549a2eSEric Dumazet 5374b549a2eSEric Dumazet if (cl) 5384b549a2eSEric Dumazet return NULL; 5394b549a2eSEric Dumazet return &q->filter_list; 5404b549a2eSEric Dumazet } 5414b549a2eSEric Dumazet 5424b549a2eSEric Dumazet static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, 5434b549a2eSEric Dumazet struct sk_buff *skb, struct tcmsg *tcm) 5444b549a2eSEric Dumazet { 5454b549a2eSEric Dumazet tcm->tcm_handle |= TC_H_MIN(cl); 5464b549a2eSEric Dumazet return 0; 5474b549a2eSEric Dumazet } 5484b549a2eSEric Dumazet 5494b549a2eSEric Dumazet static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, 5504b549a2eSEric Dumazet struct gnet_dump *d) 5514b549a2eSEric Dumazet { 5524b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 5534b549a2eSEric Dumazet u32 idx = cl - 1; 5544b549a2eSEric Dumazet struct gnet_stats_queue qs = { 0 }; 5554b549a2eSEric Dumazet struct tc_fq_codel_xstats xstats; 5564b549a2eSEric Dumazet 5574b549a2eSEric Dumazet if (idx < q->flows_cnt) { 5584b549a2eSEric Dumazet const struct fq_codel_flow *flow = &q->flows[idx]; 5594b549a2eSEric Dumazet const struct sk_buff *skb = flow->head; 5604b549a2eSEric Dumazet 5614b549a2eSEric Dumazet memset(&xstats, 0, sizeof(xstats)); 5624b549a2eSEric Dumazet xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; 5634b549a2eSEric Dumazet xstats.class_stats.deficit = flow->deficit; 5644b549a2eSEric Dumazet xstats.class_stats.ldelay = 5654b549a2eSEric Dumazet codel_time_to_us(flow->cvars.ldelay); 5664b549a2eSEric Dumazet xstats.class_stats.count = flow->cvars.count; 5674b549a2eSEric Dumazet xstats.class_stats.lastcount = flow->cvars.lastcount; 5684b549a2eSEric Dumazet xstats.class_stats.dropping = flow->cvars.dropping; 5694b549a2eSEric Dumazet if (flow->cvars.dropping) { 5704b549a2eSEric Dumazet codel_tdiff_t delta = flow->cvars.drop_next - 5714b549a2eSEric Dumazet codel_get_time(); 5724b549a2eSEric Dumazet 5734b549a2eSEric Dumazet xstats.class_stats.drop_next = (delta >= 0) ? 5744b549a2eSEric Dumazet codel_time_to_us(delta) : 5754b549a2eSEric Dumazet -codel_time_to_us(-delta); 5764b549a2eSEric Dumazet } 5774b549a2eSEric Dumazet while (skb) { 5784b549a2eSEric Dumazet qs.qlen++; 5794b549a2eSEric Dumazet skb = skb->next; 5804b549a2eSEric Dumazet } 5814b549a2eSEric Dumazet qs.backlog = q->backlogs[idx]; 5824b549a2eSEric Dumazet qs.drops = flow->dropped; 5834b549a2eSEric Dumazet } 584b0ab6f92SJohn Fastabend if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0) 5854b549a2eSEric Dumazet return -1; 5864b549a2eSEric Dumazet if (idx < q->flows_cnt) 5874b549a2eSEric Dumazet return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); 5884b549a2eSEric Dumazet return 0; 5894b549a2eSEric Dumazet } 5904b549a2eSEric Dumazet 5914b549a2eSEric Dumazet static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) 5924b549a2eSEric Dumazet { 5934b549a2eSEric Dumazet struct fq_codel_sched_data *q = qdisc_priv(sch); 5944b549a2eSEric Dumazet unsigned int i; 5954b549a2eSEric Dumazet 5964b549a2eSEric Dumazet if (arg->stop) 5974b549a2eSEric Dumazet return; 5984b549a2eSEric Dumazet 5994b549a2eSEric Dumazet for (i = 0; i < q->flows_cnt; i++) { 6004b549a2eSEric Dumazet if (list_empty(&q->flows[i].flowchain) || 6014b549a2eSEric Dumazet arg->count < arg->skip) { 6024b549a2eSEric Dumazet arg->count++; 6034b549a2eSEric Dumazet continue; 6044b549a2eSEric Dumazet } 6054b549a2eSEric Dumazet if (arg->fn(sch, i + 1, arg) < 0) { 6064b549a2eSEric Dumazet arg->stop = 1; 6074b549a2eSEric Dumazet break; 6084b549a2eSEric Dumazet } 6094b549a2eSEric Dumazet arg->count++; 6104b549a2eSEric Dumazet } 6114b549a2eSEric Dumazet } 6124b549a2eSEric Dumazet 6134b549a2eSEric Dumazet static const struct Qdisc_class_ops fq_codel_class_ops = { 6144b549a2eSEric Dumazet .leaf = fq_codel_leaf, 6154b549a2eSEric Dumazet .get = fq_codel_get, 6164b549a2eSEric Dumazet .put = fq_codel_put, 6174b549a2eSEric Dumazet .tcf_chain = fq_codel_find_tcf, 6184b549a2eSEric Dumazet .bind_tcf = fq_codel_bind, 6194b549a2eSEric Dumazet .unbind_tcf = fq_codel_put, 6204b549a2eSEric Dumazet .dump = fq_codel_dump_class, 6214b549a2eSEric Dumazet .dump_stats = fq_codel_dump_class_stats, 6224b549a2eSEric Dumazet .walk = fq_codel_walk, 6234b549a2eSEric Dumazet }; 6244b549a2eSEric Dumazet 6254b549a2eSEric Dumazet static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { 6264b549a2eSEric Dumazet .cl_ops = &fq_codel_class_ops, 6274b549a2eSEric Dumazet .id = "fq_codel", 6284b549a2eSEric Dumazet .priv_size = sizeof(struct fq_codel_sched_data), 6294b549a2eSEric Dumazet .enqueue = fq_codel_enqueue, 6304b549a2eSEric Dumazet .dequeue = fq_codel_dequeue, 6314b549a2eSEric Dumazet .peek = qdisc_peek_dequeued, 632c0afd9ceSWANG Cong .drop = fq_codel_qdisc_drop, 6334b549a2eSEric Dumazet .init = fq_codel_init, 6344b549a2eSEric Dumazet .reset = fq_codel_reset, 6354b549a2eSEric Dumazet .destroy = fq_codel_destroy, 6364b549a2eSEric Dumazet .change = fq_codel_change, 6374b549a2eSEric Dumazet .dump = fq_codel_dump, 6384b549a2eSEric Dumazet .dump_stats = fq_codel_dump_stats, 6394b549a2eSEric Dumazet .owner = THIS_MODULE, 6404b549a2eSEric Dumazet }; 6414b549a2eSEric Dumazet 6424b549a2eSEric Dumazet static int __init fq_codel_module_init(void) 6434b549a2eSEric Dumazet { 6444b549a2eSEric Dumazet return register_qdisc(&fq_codel_qdisc_ops); 6454b549a2eSEric Dumazet } 6464b549a2eSEric Dumazet 6474b549a2eSEric Dumazet static void __exit fq_codel_module_exit(void) 6484b549a2eSEric Dumazet { 6494b549a2eSEric Dumazet unregister_qdisc(&fq_codel_qdisc_ops); 6504b549a2eSEric Dumazet } 6514b549a2eSEric Dumazet 6524b549a2eSEric Dumazet module_init(fq_codel_module_init) 6534b549a2eSEric Dumazet module_exit(fq_codel_module_exit) 6544b549a2eSEric Dumazet MODULE_AUTHOR("Eric Dumazet"); 6554b549a2eSEric Dumazet MODULE_LICENSE("GPL"); 656