14b549a2eSEric Dumazet /* 24b549a2eSEric Dumazet * Fair Queue CoDel discipline 34b549a2eSEric Dumazet * 44b549a2eSEric Dumazet * This program is free software; you can redistribute it and/or 54b549a2eSEric Dumazet * modify it under the terms of the GNU General Public License 64b549a2eSEric Dumazet * as published by the Free Software Foundation; either version 74b549a2eSEric Dumazet * 2 of the License, or (at your option) any later version. 84b549a2eSEric Dumazet * 980ba92faSEric Dumazet * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> 104b549a2eSEric Dumazet */ 114b549a2eSEric Dumazet 124b549a2eSEric Dumazet #include <linux/module.h> 134b549a2eSEric Dumazet #include <linux/types.h> 144b549a2eSEric Dumazet #include <linux/kernel.h> 154b549a2eSEric Dumazet #include <linux/jiffies.h> 164b549a2eSEric Dumazet #include <linux/string.h> 174b549a2eSEric Dumazet #include <linux/in.h> 184b549a2eSEric Dumazet #include <linux/errno.h> 194b549a2eSEric Dumazet #include <linux/init.h> 204b549a2eSEric Dumazet #include <linux/skbuff.h> 214b549a2eSEric Dumazet #include <linux/jhash.h> 224b549a2eSEric Dumazet #include <linux/slab.h> 234b549a2eSEric Dumazet #include <linux/vmalloc.h> 244b549a2eSEric Dumazet #include <net/netlink.h> 254b549a2eSEric Dumazet #include <net/pkt_sched.h> 264b549a2eSEric Dumazet #include <net/codel.h> 274b549a2eSEric Dumazet 284b549a2eSEric Dumazet /* Fair Queue CoDel. 294b549a2eSEric Dumazet * 304b549a2eSEric Dumazet * Principles : 314b549a2eSEric Dumazet * Packets are classified (internal classifier or external) on flows. 324b549a2eSEric Dumazet * This is a Stochastic model (as we use a hash, several flows 334b549a2eSEric Dumazet * might be hashed on same slot) 344b549a2eSEric Dumazet * Each flow has a CoDel managed queue. 354b549a2eSEric Dumazet * Flows are linked onto two (Round Robin) lists, 364b549a2eSEric Dumazet * so that new flows have priority on old ones. 
 *
 * For a given flow, packets are not reordered (CoDel uses a FIFO)
 * head drops only.
 * ECN capability is on by default.
 * Low memory footprint (64 bytes per flow)
 */

/* Per-flow state: a singly linked FIFO of skbs plus CoDel control vars.
 * Flows sit on either the new_flows or old_flows round-robin list via
 * flowchain; an unlinked (list_empty) flowchain means the flow is idle.
 */
struct fq_codel_flow {
	struct sk_buff	  *head;
	struct sk_buff	  *tail;
	struct list_head  flowchain;
	int		  deficit;	/* DRR byte credit for this round */
	u32		  dropped;	/* number of drops (or ECN marks) on this flow */
	struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */

struct fq_codel_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
	u32		*backlogs;	/* backlog table [flows_cnt] */
	u32		flows_cnt;	/* number of flows */
	u32		perturbation;	/* hash perturbation */
	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
	struct codel_params cparams;
	struct codel_stats cstats;
	u32		drop_overlimit;
	u32		new_flow_count;

	struct list_head new_flows;	/* list of new flows */
	struct list_head old_flows;	/* list of old flows */
};

/* Hash skb into a flow slot in [0, flows_cnt) using the perturbed flow
 * hash (perturbation defeats deliberate hash collisions).
 */
static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
				  struct sk_buff *skb)
{
	u32 hash = skb_get_hash_perturb(skb, q->perturbation);

	return reciprocal_scale(hash, q->flows_cnt);
}

/* Select a flow for skb. Returns a 1-based flow index, or 0 when the
 * packet must not be enqueued (filter verdict); in that case *qerr holds
 * the enqueue return code for the caller.
 */
static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	int result;

	/* skb->priority can directly address a flow when its major number
	 * matches this qdisc and the minor is a valid 1-based slot.
	 */
	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->flows_cnt)
		return TC_H_MIN(skb->priority);

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		return fq_codel_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, filter, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through: stolen/queued also return 0 */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->flows_cnt)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/* helper functions : might be changed when/if skb use a standard list_head */

/* remove one skb from head of slot queue */
/* Pop the head skb of a flow's FIFO. Caller must ensure flow->head is
 * non-NULL; backlog/qlen accounting is the caller's responsibility.
 */
static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
{
	struct sk_buff *skb = flow->head;

	flow->head = skb->next;
	skb->next = NULL;
	return skb;
}

/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_codel_flow *flow,
				  struct sk_buff *skb)
{
	if (flow->head == NULL)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

/* Overlimit handler: drop one packet from the flow with the largest
 * backlog ("fat flow") and return that flow's index so the caller can
 * tell whether its own flow was the victim.
 */
static unsigned int fq_codel_drop(struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int maxbacklog = 0, idx = 0, i, len;
	struct fq_codel_flow *flow;

	/* Queue is full! Find the fat flow and drop packet from it.
	 * This might sound expensive, but with 1024 flows, we scan
	 * 4KB of memory, and we dont need to handle a complex tree
	 * in fast path (packet queue/enqueue) with many cache misses.
	 */
	for (i = 0; i < q->flows_cnt; i++) {
		if (q->backlogs[i] > maxbacklog) {
			maxbacklog = q->backlogs[i];
			idx = i;
		}
	}
	flow = &q->flows[idx];
	skb = dequeue_head(flow);
	len = qdisc_pkt_len(skb);
	q->backlogs[idx] -= len;
	sch->q.qlen--;
	qdisc_qstats_drop(sch);
	qdisc_qstats_backlog_dec(sch, skb);
	kfree_skb(skb);
	flow->dropped++;
	return idx;
}

/* Qdisc enqueue entry point: classify skb onto a flow, timestamp it for
 * CoDel, append to the flow FIFO, and activate an idle flow on the
 * new_flows list. Over sch->limit, one packet is dropped from the
 * fattest flow (which may not be ours).
 */
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	unsigned int idx;
	struct fq_codel_flow *flow;
	int uninitialized_var(ret);

	idx = fq_codel_classify(skb, sch, &ret);
	if (idx == 0) {
		/* filter verdict: count as drop only for the bypass case */
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return ret;
	}
	idx--;	/* classify() returns a 1-based index */

	codel_set_enqueue_time(skb);
	flow = &q->flows[idx];
	flow_queue_add(flow, skb);
	q->backlogs[idx] += qdisc_pkt_len(skb);
	qdisc_qstats_backlog_inc(sch, skb);

	/* An unlinked flowchain means the flow was idle: give it fresh
	 * quantum and new-flow priority.
	 */
	if (list_empty(&flow->flowchain)) {
		list_add_tail(&flow->flowchain, &q->new_flows);
		q->new_flow_count++;
		flow->deficit = q->quantum;
		flow->dropped = 0;
	}
	if (++sch->q.qlen <= sch->limit)
		return NET_XMIT_SUCCESS;

	q->drop_overlimit++;
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (fq_codel_drop(sch) == idx)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

/* This is the specific function called from codel_dequeue()
 * to dequeue a packet from queue. Note: backlog is handled in
 * codel, we dont need to reduce it here.
 */
static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct fq_codel_flow *flow;
	struct sk_buff *skb = NULL;

	/* Recover the owning flow from its embedded codel_vars. */
	flow = container_of(vars, struct fq_codel_flow, cvars);
	if (flow->head) {
		skb = dequeue_head(flow);
		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
		sch->q.qlen--;
	}
	return skb;
}

/* Qdisc dequeue entry point: DRR over new_flows (priority) then
 * old_flows, with codel_dequeue() applied per flow. Empty flows are
 * rotated to old_flows once (starvation guard) before being unlinked.
 */
static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	struct fq_codel_flow *flow;
	struct list_head *head;
	u32 prev_drop_count, prev_ecn_mark;

begin:
	head = &q->new_flows;
	if (list_empty(head)) {
		head = &q->old_flows;
		if (list_empty(head))
			return NULL;
	}
	flow = list_first_entry(head, struct fq_codel_flow, flowchain);

	/* Flow exhausted its byte credit: refill and demote to old_flows. */
	if (flow->deficit <= 0) {
		flow->deficit += q->quantum;
		list_move_tail(&flow->flowchain, &q->old_flows);
		goto begin;
	}

	/* Snapshot global CoDel counters so per-flow drop/mark deltas can
	 * be attributed to this flow after codel_dequeue().
	 */
	prev_drop_count = q->cstats.drop_count;
	prev_ecn_mark = q->cstats.ecn_mark;

	skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
			    dequeue);

	flow->dropped += q->cstats.drop_count - prev_drop_count;
	flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;

	if (!skb) {
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && !list_empty(&q->old_flows))
			list_move_tail(&flow->flowchain, &q->old_flows);
		else
			list_del_init(&flow->flowchain);
		goto begin;
	}
	qdisc_bstats_update(sch, skb);
	flow->deficit -= qdisc_pkt_len(skb);
	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
	 * or HTB crashes. Defer it for next round.
	 */
	if (q->cstats.drop_count && sch->q.qlen) {
		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
		q->cstats.drop_count = 0;
	}
	return skb;
}

/* Flush every queued packet by draining through the normal dequeue path
 * (keeps backlog/qlen accounting consistent).
 */
static void fq_codel_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = fq_codel_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/* Netlink attribute validation policy for TCA_FQ_CODEL_* options. */
static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
	[TCA_FQ_CODEL_TARGET]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_LIMIT]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_INTERVAL]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_ECN]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
	[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
};

/* Apply netlink configuration changes. flows_cnt may only be set before
 * the flow table exists (i.e. at init time); everything else can change
 * at runtime under the qdisc tree lock. Excess packets beyond the new
 * limit are dropped before returning.
 */
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
	if (err < 0)
		return err;
	if (tb[TCA_FQ_CODEL_FLOWS]) {
		if (q->flows)
			return -EINVAL;	/* table already allocated */
		q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
		if (!q->flows_cnt ||
		    q->flows_cnt > 65536)
			return -EINVAL;
	}
	sch_tree_lock(sch);

	/* target/interval/ce_threshold arrive in usec; convert to the
	 * codel_time_t fixed-point representation.
	 */
	if (tb[TCA_FQ_CODEL_TARGET]) {
		u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);

		q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_FQ_CODEL_CE_THRESHOLD]) {
		u64 val = nla_get_u32(tb[TCA_FQ_CODEL_CE_THRESHOLD]);

		q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_FQ_CODEL_INTERVAL]) {
		u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);

		q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_FQ_CODEL_LIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);

	if (tb[TCA_FQ_CODEL_ECN])
		q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);

	if (tb[TCA_FQ_CODEL_QUANTUM])
		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));

	/* Drain packets exceeding the (possibly lowered) limit. */
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_codel_dequeue(sch);

		kfree_skb(skb);
		q->cstats.drop_count++;
	}
	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
	q->cstats.drop_count = 0;

	sch_tree_unlock(sch);
	return 0;
}

/* Zeroed allocation that falls back to vmalloc for large tables
 * (flows_cnt can be up to 65536).
 */
static void *fq_codel_zalloc(size_t sz)
{
	void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vzalloc(sz);
	return ptr;
}

/* Counterpart of fq_codel_zalloc(); kvfree handles both origins. */
static void fq_codel_free(void *addr)
{
	kvfree(addr);
}

static void fq_codel_destroy(struct Qdisc *sch)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	fq_codel_free(q->backlogs);
	fq_codel_free(q->flows);
}

/* Qdisc init: set defaults (10240 packet limit, 1024 flows, MTU quantum,
 * ECN on), apply any netlink options, then allocate the flow and backlog
 * tables sized by the (possibly overridden) flows_cnt.
 */
static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	int i;

	sch->limit = 10*1024;
	q->flows_cnt = 1024;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturbation = prandom_u32();
	INIT_LIST_HEAD(&q->new_flows);
	INIT_LIST_HEAD(&q->old_flows);
	codel_params_init(&q->cparams, sch);
	codel_stats_init(&q->cstats);
	q->cparams.ecn = true;

	if (opt) {
		int err = fq_codel_change(sch, opt);
		if (err)
			return err;
	}

	if (!q->flows) {
		q->flows = fq_codel_zalloc(q->flows_cnt *
					   sizeof(struct fq_codel_flow));
		if (!q->flows)
			return -ENOMEM;
		q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
		if (!q->backlogs) {
			fq_codel_free(q->flows);
			return -ENOMEM;
		}
		for (i = 0; i < q->flows_cnt; i++) {
			struct fq_codel_flow *flow = q->flows + i;

			INIT_LIST_HEAD(&flow->flowchain);
			codel_vars_init(&flow->cvars);
		}
	}
	if (sch->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

/* Dump current configuration as nested TCA_FQ_CODEL_* attributes.
 * ce_threshold is emitted only when enabled (!= CODEL_DISABLED_THRESHOLD).
 */
static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
			codel_time_to_us(q->cparams.target)) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
			sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
			codel_time_to_us(q->cparams.interval)) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_ECN,
			q->cparams.ecn) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
			q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
			q->flows_cnt))
		goto nla_put_failure;

	if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD &&
	    nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD,
			codel_time_to_us(q->cparams.ce_threshold)))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}

/* Dump qdisc-level extended stats, including the current lengths of the
 * new/old flow lists (counted by walking them).
 */
static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	struct tc_fq_codel_xstats st = {
		.type				= TCA_FQ_CODEL_XSTATS_QDISC,
	};
	struct list_head *pos;

	st.qdisc_stats.maxpacket = q->cstats.maxpacket;
	st.qdisc_stats.drop_overlimit = q->drop_overlimit;
	st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
	st.qdisc_stats.new_flow_count = q->new_flow_count;
	st.qdisc_stats.ce_mark = q->cstats.ce_mark;

	list_for_each(pos, &q->new_flows)
		st.qdisc_stats.new_flows_len++;

	list_for_each(pos, &q->old_flows)
		st.qdisc_stats.old_flows_len++;

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

/* Class ops below expose each flow as a pseudo-class; flows have no
 * child qdiscs, hence the NULL leaf.
 */
static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
				   u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void fq_codel_put(struct Qdisc *q, unsigned long cl)
{
}

/* Return the filter chain anchor; only the root (cl == 0) has one. */
static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch,
						  unsigned long cl)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
			       struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

/* Dump per-flow (pseudo-class) stats: queue length/backlog/drops plus
 * CoDel state (deficit, ldelay, drop_next). cl is the 1-based flow index.
 */
static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				     struct gnet_dump *d)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	u32 idx = cl - 1;
	struct gnet_stats_queue qs = { 0 };
	struct tc_fq_codel_xstats xstats;

	if (idx < q->flows_cnt) {
		const struct fq_codel_flow *flow = &q->flows[idx];
		const struct sk_buff *skb = flow->head;

		memset(&xstats, 0, sizeof(xstats));
		xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
		xstats.class_stats.deficit = flow->deficit;
		xstats.class_stats.ldelay =
			codel_time_to_us(flow->cvars.ldelay);
		xstats.class_stats.count = flow->cvars.count;
		xstats.class_stats.lastcount = flow->cvars.lastcount;
		xstats.class_stats.dropping = flow->cvars.dropping;
		if (flow->cvars.dropping) {
			/* drop_next reported as signed usec offset from now */
			codel_tdiff_t delta = flow->cvars.drop_next -
					      codel_get_time();

			xstats.class_stats.drop_next = (delta >= 0) ?
				codel_time_to_us(delta) :
				-codel_time_to_us(-delta);
		}
		/* Walk the FIFO to count queued packets for this flow. */
		while (skb) {
			qs.qlen++;
			skb = skb->next;
		}
		qs.backlog = q->backlogs[idx];
		qs.drops = flow->dropped;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
		return -1;
	if (idx < q->flows_cnt)
		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
	return 0;
}

/* Iterate active (linked) flows for class enumeration, honoring the
 * walker's skip/count/stop protocol. Class ids are 1-based flow indices.
 */
static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct fq_codel_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->flows_cnt; i++) {
		if (list_empty(&q->flows[i].flowchain) ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops fq_codel_class_ops = {
	.leaf		=	fq_codel_leaf,
	.get		=	fq_codel_get,
	.put		=	fq_codel_put,
	.tcf_chain	=	fq_codel_find_tcf,
	.bind_tcf	=	fq_codel_bind,
	.unbind_tcf	=	fq_codel_put,
	.dump		=	fq_codel_dump_class,
	.dump_stats	=	fq_codel_dump_class_stats,
	.walk		=	fq_codel_walk,
};

static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
	.cl_ops		=	&fq_codel_class_ops,
	.id		=	"fq_codel",
	.priv_size	=	sizeof(struct fq_codel_sched_data),
	.enqueue	=	fq_codel_enqueue,
	.dequeue	=	fq_codel_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	fq_codel_drop,
	.init		=	fq_codel_init,
	.reset		=	fq_codel_reset,
	.destroy	=	fq_codel_destroy,
	.change		=	fq_codel_change,
	.dump		=	fq_codel_dump,
	.dump_stats	=	fq_codel_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init fq_codel_module_init(void)
{
	return register_qdisc(&fq_codel_qdisc_ops);
}

static void __exit fq_codel_module_exit(void)
{
	unregister_qdisc(&fq_codel_qdisc_ops);
}

module_init(fq_codel_module_init)
module_exit(fq_codel_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");