1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4 *
5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11 *
12 * The CAKE Principles:
13 * (or, how to have your cake and eat it too)
14 *
15 * This is a combination of several shaping, AQM and FQ techniques into one
16 * easy-to-use package:
17 *
18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
20 * eliminating the need for any sort of burst parameter (eg. token bucket
21 * depth). Burst support is limited to that necessary to overcome scheduling
22 * latency.
23 *
24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
25 * up to a specified fraction of bandwidth. Above that bandwidth threshold,
26 * the priority is reduced to avoid starving other tins.
27 *
28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
29 * flows from each other. This prevents a burst on one flow from increasing
30 * the delay to another. Flows are distributed to queues using a
31 * set-associative hash function.
32 *
33 * - Each queue is actively managed by Cobalt, which is a combination of the
34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals
35 * congestion early via ECN (if available) and/or packet drops, to keep
36 * latency low. The codel parameters are auto-tuned based on the bandwidth
37 * setting, as is necessary at low bandwidths.
38 *
39 * The configuration parameters are kept deliberately simple for ease of use.
40 * Everything has sane defaults. Complete generality of configuration is *not*
41 * a goal.
42 *
43 * The priority queue operates according to a weighted DRR scheme, combined with
44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
45 * bandwidth sharing threshold the tin is operating on. This determines whether a
46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
47 * that tin in the current pass.
48 *
49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50 * granted us permission to leverage.
51 */
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83
84 /* struct cobalt_params - contains codel and blue parameters
85 * @interval: codel initial drop rate
86 * @target: maximum persistent sojourn time & blue update rate
87 * @mtu_time: serialisation delay of maximum-size packet
88 * @p_inc: increment of blue drop probability (0.32 fxp)
89 * @p_dec: decrement of blue drop probability (0.32 fxp)
90 */
91 struct cobalt_params {
92 u64 interval;
93 u64 target;
94 u64 mtu_time;
95 u32 p_inc;
96 u32 p_dec;
97 };
98
99 /* struct cobalt_vars - contains codel and blue variables
100 * @count: codel dropping frequency
101 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
102 * @drop_next: time to drop next packet, or when we dropped last
103 * @blue_timer: Blue time to next drop
104 * @p_drop: BLUE drop probability (0.32 fxp)
105 * @dropping: set if in dropping state
106 * @ecn_marked: set if marked
107 */
108 struct cobalt_vars {
109 u32 count;
110 u32 rec_inv_sqrt;
111 ktime_t drop_next;
112 ktime_t blue_timer;
113 u32 p_drop;
114 bool dropping;
115 bool ecn_marked;
116 };
117
118 enum {
119 CAKE_SET_NONE = 0,
120 CAKE_SET_SPARSE,
121 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 CAKE_SET_BULK,
123 CAKE_SET_DECAYING
124 };
125
126 struct cake_flow {
127 /* this stuff is all needed per-flow at dequeue time */
128 struct sk_buff *head;
129 struct sk_buff *tail;
130 struct list_head flowchain;
131 s32 deficit;
132 u32 dropped;
133 struct cobalt_vars cvars;
134 u16 srchost; /* index into cake_host table */
135 u16 dsthost;
136 u8 set;
137 }; /* please try to keep this structure <= 64 bytes */
138
139 struct cake_host {
140 u32 srchost_tag;
141 u32 dsthost_tag;
142 u16 srchost_bulk_flow_count;
143 u16 dsthost_bulk_flow_count;
144 };
145
146 struct cake_heap_entry {
147 u16 t:3, b:10;
148 };
149
150 struct cake_tin_data {
151 struct cake_flow flows[CAKE_QUEUES];
152 u32 backlogs[CAKE_QUEUES];
153 u32 tags[CAKE_QUEUES]; /* for set association */
154 u16 overflow_idx[CAKE_QUEUES];
155 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 u16 flow_quantum;
157
158 struct cobalt_params cparams;
159 u32 drop_overlimit;
160 u16 bulk_flow_count;
161 u16 sparse_flow_count;
162 u16 decaying_flow_count;
163 u16 unresponsive_flow_count;
164
165 u32 max_skblen;
166
167 struct list_head new_flows;
168 struct list_head old_flows;
169 struct list_head decaying_flows;
170
171 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 ktime_t time_next_packet;
173 u64 tin_rate_ns;
174 u64 tin_rate_bps;
175 u16 tin_rate_shft;
176
177 u16 tin_quantum;
178 s32 tin_deficit;
179 u32 tin_backlog;
180 u32 tin_dropped;
181 u32 tin_ecn_mark;
182
183 u32 packets;
184 u64 bytes;
185
186 u32 ack_drops;
187
188 /* moving averages */
189 u64 avge_delay;
190 u64 peak_delay;
191 u64 base_delay;
192
193 /* hash function stats */
194 u32 way_directs;
195 u32 way_hits;
196 u32 way_misses;
197 u32 way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199
200 struct cake_sched_data {
201 struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 struct tcf_block *block;
203 struct cake_tin_data *tins;
204
205 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 u16 overflow_timeout;
207
208 u16 tin_cnt;
209 u8 tin_mode;
210 u8 flow_mode;
211 u8 ack_filter;
212 u8 atm_mode;
213
214 u32 fwmark_mask;
215 u16 fwmark_shft;
216
217 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 u16 rate_shft;
219 ktime_t time_next_packet;
220 ktime_t failsafe_next_packet;
221 u64 rate_ns;
222 u64 rate_bps;
223 u16 rate_flags;
224 s16 rate_overhead;
225 u16 rate_mpu;
226 u64 interval;
227 u64 target;
228
229 /* resource tracking */
230 u32 buffer_used;
231 u32 buffer_max_used;
232 u32 buffer_limit;
233 u32 buffer_config_limit;
234
235 /* indices for dequeue */
236 u16 cur_tin;
237 u16 cur_flow;
238
239 struct qdisc_watchdog watchdog;
240 const u8 *tin_index;
241 const u8 *tin_order;
242
243 /* bandwidth capacity estimate */
244 ktime_t last_packet_time;
245 ktime_t avg_window_begin;
246 u64 avg_packet_interval;
247 u64 avg_window_bytes;
248 u64 avg_peak_bandwidth;
249 ktime_t last_reconfig_time;
250
251 /* packet length stats */
252 u32 avg_netoff;
253 u16 max_netlen;
254 u16 max_adjlen;
255 u16 min_netlen;
256 u16 min_adjlen;
257 };
258
259 enum {
260 CAKE_FLAG_OVERHEAD = BIT(0),
261 CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 CAKE_FLAG_INGRESS = BIT(2),
263 CAKE_FLAG_WASH = BIT(3),
264 CAKE_FLAG_SPLIT_GSO = BIT(4)
265 };
266
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268 * obtain the best features of each. Codel is excellent on flows which
269 * respond to congestion signals in a TCP-like way. BLUE is more effective on
270 * unresponsive flows.
271 */
272
273 struct cobalt_skb_cb {
274 ktime_t enqueue_time;
275 u32 adjusted_len;
276 };
277
278 static u64 us_to_ns(u64 us)
279 {
280 return us * NSEC_PER_USEC;
281 }
282
283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288
289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 return get_cobalt_cb(skb)->enqueue_time;
292 }
293
294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 ktime_t now)
296 {
297 get_cobalt_cb(skb)->enqueue_time = now;
298 }
299
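/* Reciprocal-division table: quantum_div[i] is filled in when the qdisc is
 * initialised (roughly 65535 / i for each nonzero i), so that
 * cake_get_flow_quantum() can scale the per-flow quantum down by the
 * per-host bulk flow count without a division in the fast path.
 */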
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301
302 /* Diffserv lookup tables */
303
304 static const u8 precedence[] = {
305 0, 0, 0, 0, 0, 0, 0, 0,
306 1, 1, 1, 1, 1, 1, 1, 1,
307 2, 2, 2, 2, 2, 2, 2, 2,
308 3, 3, 3, 3, 3, 3, 3, 3,
309 4, 4, 4, 4, 4, 4, 4, 4,
310 5, 5, 5, 5, 5, 5, 5, 5,
311 6, 6, 6, 6, 6, 6, 6, 6,
312 7, 7, 7, 7, 7, 7, 7, 7,
313 };
314
315 static const u8 diffserv8[] = {
316 2, 0, 1, 2, 4, 2, 2, 2,
317 1, 2, 1, 2, 1, 2, 1, 2,
318 5, 2, 4, 2, 4, 2, 4, 2,
319 3, 2, 3, 2, 3, 2, 3, 2,
320 6, 2, 3, 2, 3, 2, 3, 2,
321 6, 2, 2, 2, 6, 2, 6, 2,
322 7, 2, 2, 2, 2, 2, 2, 2,
323 7, 2, 2, 2, 2, 2, 2, 2,
324 };
325
326 static const u8 diffserv4[] = {
327 0, 1, 0, 0, 2, 0, 0, 0,
328 1, 0, 0, 0, 0, 0, 0, 0,
329 2, 0, 2, 0, 2, 0, 2, 0,
330 2, 0, 2, 0, 2, 0, 2, 0,
331 3, 0, 2, 0, 2, 0, 2, 0,
332 3, 0, 0, 0, 3, 0, 3, 0,
333 3, 0, 0, 0, 0, 0, 0, 0,
334 3, 0, 0, 0, 0, 0, 0, 0,
335 };
336
337 static const u8 diffserv3[] = {
338 0, 1, 0, 0, 2, 0, 0, 0,
339 1, 0, 0, 0, 0, 0, 0, 0,
340 0, 0, 0, 0, 0, 0, 0, 0,
341 0, 0, 0, 0, 0, 0, 0, 0,
342 0, 0, 0, 0, 0, 0, 0, 0,
343 0, 0, 0, 0, 2, 0, 2, 0,
344 2, 0, 0, 0, 0, 0, 0, 0,
345 2, 0, 0, 0, 0, 0, 0, 0,
346 };
347
348 static const u8 besteffort[] = {
349 0, 0, 0, 0, 0, 0, 0, 0,
350 0, 0, 0, 0, 0, 0, 0, 0,
351 0, 0, 0, 0, 0, 0, 0, 0,
352 0, 0, 0, 0, 0, 0, 0, 0,
353 0, 0, 0, 0, 0, 0, 0, 0,
354 0, 0, 0, 0, 0, 0, 0, 0,
355 0, 0, 0, 0, 0, 0, 0, 0,
356 0, 0, 0, 0, 0, 0, 0, 0,
357 };
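/* Each table maps the 6-bit DSCP codepoint to a tin index for one preset.
 * For example, in the diffserv3 preset diffserv3[46] == 2, so EF-marked
 * packets go to the highest-priority tin (2), CS1/LE map to tin 1, and
 * everything else stays in the best-effort tin 0.
 */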
358
359 /* tin priority order for stats dumping */
360
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363
364 #define REC_INV_SQRT_CACHE (16)
365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366
367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369 *
370 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371 */
372
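/* For illustration: with count == 4 the exact result is 1/sqrt(4) = 0.5,
 * i.e. 0x80000000 in Q0.32; each cobalt_newton_step() call should move
 * rec_inv_sqrt closer to that value.
 */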
373 static void cobalt_newton_step(struct cobalt_vars *vars)
374 {
375 u32 invsqrt, invsqrt2;
376 u64 val;
377
378 invsqrt = vars->rec_inv_sqrt;
379 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381
382 val >>= 2; /* avoid overflow in following multiply */
383 val = (val * invsqrt) >> (32 - 2 + 1);
384
385 vars->rec_inv_sqrt = val;
386 }
387
388 static void cobalt_invsqrt(struct cobalt_vars *vars)
389 {
390 if (vars->count < REC_INV_SQRT_CACHE)
391 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392 else
393 cobalt_newton_step(vars);
394 }
395
396 /* There is a big difference in timing between the accurate values placed in
397 * the cache and the approximations given by a single Newton step for small
398 * count values, particularly when stepping from count 1 to 2 or vice versa.
399 * Above 16, a single Newton step gives sufficient accuracy in either
400 * direction, given the precision stored.
401 *
402 * The magnitude of the error when stepping up to count 2 is such as to give
403 * the value that *should* have been produced at count 4.
404 */
405
406 static void cobalt_cache_init(void)
407 {
408 struct cobalt_vars v;
409
410 memset(&v, 0, sizeof(v));
411 v.rec_inv_sqrt = ~0U;
412 cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413
414 for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415 cobalt_newton_step(&v);
416 cobalt_newton_step(&v);
417 cobalt_newton_step(&v);
418 cobalt_newton_step(&v);
419
420 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421 }
422 }
423
424 static void cobalt_vars_init(struct cobalt_vars *vars)
425 {
426 memset(vars, 0, sizeof(*vars));
427
428 if (!cobalt_rec_inv_sqrt_cache[0]) {
429 cobalt_cache_init();
430 cobalt_rec_inv_sqrt_cache[0] = ~0;
431 }
432 }
433
434 /* CoDel control_law is t + interval/sqrt(count)
435 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436 * both sqrt() and divide operation.
437 */
438 static ktime_t cobalt_control(ktime_t t,
439 u64 interval,
440 u32 rec_inv_sqrt)
441 {
442 return ktime_add_ns(t, reciprocal_scale(interval,
443 rec_inv_sqrt));
444 }
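/* Example: with interval = 100ms and count = 4, rec_inv_sqrt is ~0.5 in
 * Q0.32, so reciprocal_scale() yields ~50ms and the next signal is scheduled
 * at t + interval/sqrt(count).
 */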
445
446 /* Call this when a packet had to be dropped due to queue overflow. Returns
447 * true if the BLUE state was quiescent before but active after this call.
448 */
449 static bool cobalt_queue_full(struct cobalt_vars *vars,
450 struct cobalt_params *p,
451 ktime_t now)
452 {
453 bool up = false;
454
455 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456 up = !vars->p_drop;
457 vars->p_drop += p->p_inc;
458 if (vars->p_drop < p->p_inc)
459 vars->p_drop = ~0;
460 vars->blue_timer = now;
461 }
462 vars->dropping = true;
463 vars->drop_next = now;
464 if (!vars->count)
465 vars->count = 1;
466
467 return up;
468 }
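/* Note: p_drop is a Q0.32 probability, so saturating it to ~0U means
 * effectively "always drop"; the actual Bernoulli trial
 * (get_random_u32() < p_drop) happens in cobalt_should_drop().
 */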
469
470 /* Call this when the queue was serviced but turned out to be empty. Returns
471 * true if the BLUE state was active before but quiescent after this call.
472 */
473 static bool cobalt_queue_empty(struct cobalt_vars *vars,
474 struct cobalt_params *p,
475 ktime_t now)
476 {
477 bool down = false;
478
479 if (vars->p_drop &&
480 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481 if (vars->p_drop < p->p_dec)
482 vars->p_drop = 0;
483 else
484 vars->p_drop -= p->p_dec;
485 vars->blue_timer = now;
486 down = !vars->p_drop;
487 }
488 vars->dropping = false;
489
490 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491 vars->count--;
492 cobalt_invsqrt(vars);
493 vars->drop_next = cobalt_control(vars->drop_next,
494 p->interval,
495 vars->rec_inv_sqrt);
496 }
497
498 return down;
499 }
500
501 /* Call this with a freshly dequeued packet for possible congestion marking.
502 * Returns true as an instruction to drop the packet, false for delivery.
503 */
504 static bool cobalt_should_drop(struct cobalt_vars *vars,
505 struct cobalt_params *p,
506 ktime_t now,
507 struct sk_buff *skb,
508 u32 bulk_flows)
509 {
510 bool next_due, over_target, drop = false;
511 ktime_t schedule;
512 u64 sojourn;
513
514 /* The 'schedule' variable records, in its sign, whether 'now' is before or
515 * after 'drop_next'. This allows 'drop_next' to be updated before the next
516 * scheduling decision is actually branched, without destroying that
517 * information. Similarly, the first 'schedule' value calculated is preserved
518 * in the boolean 'next_due'.
519 *
520 * As for 'drop_next', we take advantage of the fact that 'interval' is both
521 * the delay between first exceeding 'target' and the first signalling event,
522 * *and* the scaling factor for the signalling frequency. It's therefore very
523 * natural to use a single mechanism for both purposes, and eliminates a
524 * significant amount of reference Codel's spaghetti code. To help with this,
525 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526 * as possible to 1.0 in fixed-point.
527 */
528
529 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530 schedule = ktime_sub(now, vars->drop_next);
531 over_target = sojourn > p->target &&
532 sojourn > p->mtu_time * bulk_flows * 2 &&
533 sojourn > p->mtu_time * 4;
534 next_due = vars->count && ktime_to_ns(schedule) >= 0;
535
536 vars->ecn_marked = false;
537
538 if (over_target) {
539 if (!vars->dropping) {
540 vars->dropping = true;
541 vars->drop_next = cobalt_control(now,
542 p->interval,
543 vars->rec_inv_sqrt);
544 }
545 if (!vars->count)
546 vars->count = 1;
547 } else if (vars->dropping) {
548 vars->dropping = false;
549 }
550
551 if (next_due && vars->dropping) {
552 /* Use ECN mark if possible, otherwise drop */
553 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554
555 vars->count++;
556 if (!vars->count)
557 vars->count--;
558 cobalt_invsqrt(vars);
559 vars->drop_next = cobalt_control(vars->drop_next,
560 p->interval,
561 vars->rec_inv_sqrt);
562 schedule = ktime_sub(now, vars->drop_next);
563 } else {
564 while (next_due) {
565 vars->count--;
566 cobalt_invsqrt(vars);
567 vars->drop_next = cobalt_control(vars->drop_next,
568 p->interval,
569 vars->rec_inv_sqrt);
570 schedule = ktime_sub(now, vars->drop_next);
571 next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 }
573 }
574
575 /* Simple BLUE implementation. Lack of ECN is deliberate. */
576 if (vars->p_drop)
577 drop |= (get_random_u32() < vars->p_drop);
578
579 /* Overload the drop_next field as an activity timeout */
580 if (!vars->count)
581 vars->drop_next = ktime_add_ns(now, p->interval);
582 else if (ktime_to_ns(schedule) > 0 && !drop)
583 vars->drop_next = now;
584
585 return drop;
586 }
587
588 static bool cake_update_flowkeys(struct flow_keys *keys,
589 const struct sk_buff *skb)
590 {
591 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592 struct nf_conntrack_tuple tuple = {};
593 bool rev = !skb->_nfct, upd = false;
594 __be32 ip;
595
596 if (skb_protocol(skb, true) != htons(ETH_P_IP))
597 return false;
598
599 if (!nf_ct_get_tuple_skb(&tuple, skb))
600 return false;
601
602 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603 if (ip != keys->addrs.v4addrs.src) {
604 keys->addrs.v4addrs.src = ip;
605 upd = true;
606 }
607 ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608 if (ip != keys->addrs.v4addrs.dst) {
609 keys->addrs.v4addrs.dst = ip;
610 upd = true;
611 }
612
613 if (keys->ports.ports) {
614 __be16 port;
615
616 port = rev ? tuple.dst.u.all : tuple.src.u.all;
617 if (port != keys->ports.src) {
618 keys->ports.src = port;
619 upd = true;
620 }
621 port = rev ? tuple.src.u.all : tuple.dst.u.all;
622 if (port != keys->ports.dst) {
623 keys->ports.dst = port;
624 upd = true;
625 }
626 }
627 return upd;
628 #else
629 return false;
630 #endif
631 }
632
633 /* Cake has several subtle multiple-bit flow-mode settings. The masked
634 * comparisons below deliberately match triple-isolate mode as well.
635 */
636
637 static bool cake_dsrc(int flow_mode)
638 {
639 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640 }
641
642 static bool cake_ddst(int flow_mode)
643 {
644 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645 }
646
647 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
648 struct cake_flow *flow,
649 int flow_mode)
650 {
651 if (likely(cake_dsrc(flow_mode) &&
652 q->hosts[flow->srchost].srchost_bulk_flow_count))
653 q->hosts[flow->srchost].srchost_bulk_flow_count--;
654 }
655
656 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
657 struct cake_flow *flow,
658 int flow_mode)
659 {
660 if (likely(cake_dsrc(flow_mode) &&
661 q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
662 q->hosts[flow->srchost].srchost_bulk_flow_count++;
663 }
664
665 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
666 struct cake_flow *flow,
667 int flow_mode)
668 {
669 if (likely(cake_ddst(flow_mode) &&
670 q->hosts[flow->dsthost].dsthost_bulk_flow_count))
671 q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
672 }
673
674 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
675 struct cake_flow *flow,
676 int flow_mode)
677 {
678 if (likely(cake_ddst(flow_mode) &&
679 q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
680 q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
681 }
682
683 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
684 struct cake_flow *flow,
685 int flow_mode)
686 {
687 u16 host_load = 1;
688
689 if (cake_dsrc(flow_mode))
690 host_load = max(host_load,
691 q->hosts[flow->srchost].srchost_bulk_flow_count);
692
693 if (cake_ddst(flow_mode))
694 host_load = max(host_load,
695 q->hosts[flow->dsthost].dsthost_bulk_flow_count);
696
697 /* The get_random_u16() is a way to apply dithering to avoid
698 * accumulating roundoff errors
699 */
700 return (q->flow_quantum * quantum_div[host_load] +
701 get_random_u16()) >> 16;
702 }
703
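/* Set-associative flow hashing: the CAKE_QUEUES (1024) queues are treated as
 * aligned sets of CAKE_SET_WAYS (8) ways. A packet first tries the queue its
 * reduced hash points at directly; on a collision the other ways of the same
 * eight-way set are probed before an occupied queue is reused, which keeps
 * distinct flows in distinct queues far more often than a plain hash would.
 */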
704 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
705 int flow_mode, u16 flow_override, u16 host_override)
706 {
707 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
708 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
709 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
710 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
711 u16 reduced_hash, srchost_idx, dsthost_idx;
712 struct flow_keys keys, host_keys;
713 bool use_skbhash = skb->l4_hash;
714
715 if (unlikely(flow_mode == CAKE_FLOW_NONE))
716 return 0;
717
718 /* If both overrides are set, or we can use the SKB hash and nat mode is
719 * disabled, we can skip packet dissection entirely. If nat mode is
720 * enabled there's another check below after doing the conntrack lookup.
721 */
722 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
723 goto skip_hash;
724
725 skb_flow_dissect_flow_keys(skb, &keys,
726 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
727
728 /* Don't use the SKB hash if we change the lookup keys from conntrack */
729 if (nat_enabled && cake_update_flowkeys(&keys, skb))
730 use_skbhash = false;
731
732 /* If we can still use the SKB hash and don't need the host hash, we can
733 * skip the rest of the hashing procedure
734 */
735 if (use_skbhash && !hash_hosts)
736 goto skip_hash;
737
738 /* flow_hash_from_keys() sorts the addresses by value, so we have
739 * to preserve their order in a separate data structure to treat
740 * src and dst host addresses as independently selectable.
741 */
742 host_keys = keys;
743 host_keys.ports.ports = 0;
744 host_keys.basic.ip_proto = 0;
745 host_keys.keyid.keyid = 0;
746 host_keys.tags.flow_label = 0;
747
748 switch (host_keys.control.addr_type) {
749 case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
750 host_keys.addrs.v4addrs.src = 0;
751 dsthost_hash = flow_hash_from_keys(&host_keys);
752 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
753 host_keys.addrs.v4addrs.dst = 0;
754 srchost_hash = flow_hash_from_keys(&host_keys);
755 break;
756
757 case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
758 memset(&host_keys.addrs.v6addrs.src, 0,
759 sizeof(host_keys.addrs.v6addrs.src));
760 dsthost_hash = flow_hash_from_keys(&host_keys);
761 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
762 memset(&host_keys.addrs.v6addrs.dst, 0,
763 sizeof(host_keys.addrs.v6addrs.dst));
764 srchost_hash = flow_hash_from_keys(&host_keys);
765 break;
766
767 default:
768 dsthost_hash = 0;
769 srchost_hash = 0;
770 }
771
772 /* This *must* be after the above switch, since as a
773 * side-effect it sorts the src and dst addresses.
774 */
775 if (hash_flows && !use_skbhash)
776 flow_hash = flow_hash_from_keys(&keys);
777
778 skip_hash:
779 if (flow_override)
780 flow_hash = flow_override - 1;
781 else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
782 flow_hash = skb->hash;
783 if (host_override) {
784 dsthost_hash = host_override - 1;
785 srchost_hash = host_override - 1;
786 }
787
788 if (!(flow_mode & CAKE_FLOW_FLOWS)) {
789 if (flow_mode & CAKE_FLOW_SRC_IP)
790 flow_hash ^= srchost_hash;
791
792 if (flow_mode & CAKE_FLOW_DST_IP)
793 flow_hash ^= dsthost_hash;
794 }
795
796 reduced_hash = flow_hash % CAKE_QUEUES;
797
798 /* set-associative hashing */
799 /* fast path if no hash collision (direct lookup succeeds) */
800 if (likely(q->tags[reduced_hash] == flow_hash &&
801 q->flows[reduced_hash].set)) {
802 q->way_directs++;
803 } else {
804 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
805 u32 outer_hash = reduced_hash - inner_hash;
806 bool allocate_src = false;
807 bool allocate_dst = false;
808 u32 i, k;
809
810 /* check if any active queue in the set is reserved for
811 * this flow.
812 */
813 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
814 i++, k = (k + 1) % CAKE_SET_WAYS) {
815 if (q->tags[outer_hash + k] == flow_hash) {
816 if (i)
817 q->way_hits++;
818
819 if (!q->flows[outer_hash + k].set) {
820 /* need to increment host refcnts */
821 allocate_src = cake_dsrc(flow_mode);
822 allocate_dst = cake_ddst(flow_mode);
823 }
824
825 goto found;
826 }
827 }
828
829 /* no queue is reserved for this flow, look for an
830 * empty one.
831 */
832 for (i = 0; i < CAKE_SET_WAYS;
833 i++, k = (k + 1) % CAKE_SET_WAYS) {
834 if (!q->flows[outer_hash + k].set) {
835 q->way_misses++;
836 allocate_src = cake_dsrc(flow_mode);
837 allocate_dst = cake_ddst(flow_mode);
838 goto found;
839 }
840 }
841
842 /* With no empty queues, default to the original
843 * queue, accept the collision, update the host tags.
844 */
845 q->way_collisions++;
846 allocate_src = cake_dsrc(flow_mode);
847 allocate_dst = cake_ddst(flow_mode);
848
849 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
850 cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
851 cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
852 }
853 found:
854 /* reserve queue for future packets in same flow */
855 reduced_hash = outer_hash + k;
856 q->tags[reduced_hash] = flow_hash;
857
858 if (allocate_src) {
859 srchost_idx = srchost_hash % CAKE_QUEUES;
860 inner_hash = srchost_idx % CAKE_SET_WAYS;
861 outer_hash = srchost_idx - inner_hash;
862 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
863 i++, k = (k + 1) % CAKE_SET_WAYS) {
864 if (q->hosts[outer_hash + k].srchost_tag ==
865 srchost_hash)
866 goto found_src;
867 }
868 for (i = 0; i < CAKE_SET_WAYS;
869 i++, k = (k + 1) % CAKE_SET_WAYS) {
870 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
871 break;
872 }
873 q->hosts[outer_hash + k].srchost_tag = srchost_hash;
874 found_src:
875 srchost_idx = outer_hash + k;
876 q->flows[reduced_hash].srchost = srchost_idx;
877
878 if (q->flows[reduced_hash].set == CAKE_SET_BULK)
879 cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
880 }
881
882 if (allocate_dst) {
883 dsthost_idx = dsthost_hash % CAKE_QUEUES;
884 inner_hash = dsthost_idx % CAKE_SET_WAYS;
885 outer_hash = dsthost_idx - inner_hash;
886 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
887 i++, k = (k + 1) % CAKE_SET_WAYS) {
888 if (q->hosts[outer_hash + k].dsthost_tag ==
889 dsthost_hash)
890 goto found_dst;
891 }
892 for (i = 0; i < CAKE_SET_WAYS;
893 i++, k = (k + 1) % CAKE_SET_WAYS) {
894 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
895 break;
896 }
897 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
898 found_dst:
899 dsthost_idx = outer_hash + k;
900 q->flows[reduced_hash].dsthost = dsthost_idx;
901
902 if (q->flows[reduced_hash].set == CAKE_SET_BULK)
903 cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
904 }
905 }
906
907 return reduced_hash;
908 }
909
910 /* helper functions: might be changed when/if skbs use a standard list_head */
911 /* remove one skb from head of slot queue */
912
913 static struct sk_buff *dequeue_head(struct cake_flow *flow)
914 {
915 struct sk_buff *skb = flow->head;
916
917 if (skb) {
918 flow->head = skb->next;
919 skb_mark_not_on_list(skb);
920 }
921
922 return skb;
923 }
924
925 /* add skb to flow queue (tail add) */
926
927 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
928 {
929 if (!flow->head)
930 flow->head = skb;
931 else
932 flow->tail->next = skb;
933 flow->tail = skb;
934 skb->next = NULL;
935 }
936
937 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
938 struct ipv6hdr *buf)
939 {
940 unsigned int offset = skb_network_offset(skb);
941 struct iphdr *iph;
942
943 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
944
945 if (!iph)
946 return NULL;
947
948 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
949 return skb_header_pointer(skb, offset + iph->ihl * 4,
950 sizeof(struct ipv6hdr), buf);
951
952 else if (iph->version == 4)
953 return iph;
954
955 else if (iph->version == 6)
956 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
957 buf);
958
959 return NULL;
960 }
961
962 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
963 void *buf, unsigned int bufsize)
964 {
965 unsigned int offset = skb_network_offset(skb);
966 const struct ipv6hdr *ipv6h;
967 const struct tcphdr *tcph;
968 const struct iphdr *iph;
969 struct ipv6hdr _ipv6h;
970 struct tcphdr _tcph;
971
972 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
973
974 if (!ipv6h)
975 return NULL;
976
977 if (ipv6h->version == 4) {
978 iph = (struct iphdr *)ipv6h;
979 offset += iph->ihl * 4;
980
981 /* special-case 6in4 tunnelling, as that is a common way to get
982 * v6 connectivity in the home
983 */
984 if (iph->protocol == IPPROTO_IPV6) {
985 ipv6h = skb_header_pointer(skb, offset,
986 sizeof(_ipv6h), &_ipv6h);
987
988 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
989 return NULL;
990
991 offset += sizeof(struct ipv6hdr);
992
993 } else if (iph->protocol != IPPROTO_TCP) {
994 return NULL;
995 }
996
997 } else if (ipv6h->version == 6) {
998 if (ipv6h->nexthdr != IPPROTO_TCP)
999 return NULL;
1000
1001 offset += sizeof(struct ipv6hdr);
1002 } else {
1003 return NULL;
1004 }
1005
1006 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
1007 if (!tcph || tcph->doff < 5)
1008 return NULL;
1009
1010 return skb_header_pointer(skb, offset,
1011 min(__tcp_hdrlen(tcph), bufsize), buf);
1012 }
1013
1014 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1015 int code, int *oplen)
1016 {
1017 /* inspired by tcp_parse_options in tcp_input.c */
1018 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1019 const u8 *ptr = (const u8 *)(tcph + 1);
1020
1021 while (length > 0) {
1022 int opcode = *ptr++;
1023 int opsize;
1024
1025 if (opcode == TCPOPT_EOL)
1026 break;
1027 if (opcode == TCPOPT_NOP) {
1028 length--;
1029 continue;
1030 }
1031 if (length < 2)
1032 break;
1033 opsize = *ptr++;
1034 if (opsize < 2 || opsize > length)
1035 break;
1036
1037 if (opcode == code) {
1038 *oplen = opsize;
1039 return ptr;
1040 }
1041
1042 ptr += opsize - 2;
1043 length -= opsize;
1044 }
1045
1046 return NULL;
1047 }
1048
1049 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1050 * bytes than the other. In the case where both sequences ACK bytes that the
1051 * other doesn't, A is considered greater. A DSACK in A also makes A
1052 * considered greater.
1053 *
1054 * @return -1, 0 or 1 as normal compare functions
1055 */
1056 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1057 const struct tcphdr *tcph_b)
1058 {
1059 const struct tcp_sack_block_wire *sack_a, *sack_b;
1060 u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1061 u32 bytes_a = 0, bytes_b = 0;
1062 int oplen_a, oplen_b;
1063 bool first = true;
1064
1065 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1066 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1067
1068 /* pointers point to option contents */
1069 oplen_a -= TCPOLEN_SACK_BASE;
1070 oplen_b -= TCPOLEN_SACK_BASE;
1071
1072 if (sack_a && oplen_a >= sizeof(*sack_a) &&
1073 (!sack_b || oplen_b < sizeof(*sack_b)))
1074 return -1;
1075 else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1076 (!sack_a || oplen_a < sizeof(*sack_a)))
1077 return 1;
1078 else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1079 (!sack_b || oplen_b < sizeof(*sack_b)))
1080 return 0;
1081
1082 while (oplen_a >= sizeof(*sack_a)) {
1083 const struct tcp_sack_block_wire *sack_tmp = sack_b;
1084 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1085 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1086 int oplen_tmp = oplen_b;
1087 bool found = false;
1088
1089 /* DSACK; always considered greater to prevent dropping */
1090 if (before(start_a, ack_seq_a))
1091 return -1;
1092
1093 bytes_a += end_a - start_a;
1094
1095 while (oplen_tmp >= sizeof(*sack_tmp)) {
1096 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1097 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1098
1099 /* first time through we count the total size */
1100 if (first)
1101 bytes_b += end_b - start_b;
1102
1103 if (!after(start_b, start_a) && !before(end_b, end_a)) {
1104 found = true;
1105 if (!first)
1106 break;
1107 }
1108 oplen_tmp -= sizeof(*sack_tmp);
1109 sack_tmp++;
1110 }
1111
1112 if (!found)
1113 return -1;
1114
1115 oplen_a -= sizeof(*sack_a);
1116 sack_a++;
1117 first = false;
1118 }
1119
1120 /* If we made it this far, all ranges SACKed by A are covered by B, so
1121 * either the SACKs are equal, or B SACKs more bytes.
1122 */
1123 return bytes_b > bytes_a ? 1 : 0;
1124 }
1125
1126 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1127 u32 *tsval, u32 *tsecr)
1128 {
1129 const u8 *ptr;
1130 int opsize;
1131
1132 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1133
1134 if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1135 *tsval = get_unaligned_be32(ptr);
1136 *tsecr = get_unaligned_be32(ptr + 4);
1137 }
1138 }
1139
1140 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1141 u32 tstamp_new, u32 tsecr_new)
1142 {
1143 /* inspired by tcp_parse_options in tcp_input.c */
1144 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1145 const u8 *ptr = (const u8 *)(tcph + 1);
1146 u32 tstamp, tsecr;
1147
1148 /* 3 reserved flags must be unset to avoid future breakage
1149 * ACK must be set
1150 * ECE/CWR are handled separately
1151 * All other flags URG/PSH/RST/SYN/FIN must be unset
1152 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1153 * 0x00C00000 = CWR/ECE (handled separately)
1154 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1155 */
1156 if (((tcp_flag_word(tcph) &
1157 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1158 return false;
1159
1160 while (length > 0) {
1161 int opcode = *ptr++;
1162 int opsize;
1163
1164 if (opcode == TCPOPT_EOL)
1165 break;
1166 if (opcode == TCPOPT_NOP) {
1167 length--;
1168 continue;
1169 }
1170 if (length < 2)
1171 break;
1172 opsize = *ptr++;
1173 if (opsize < 2 || opsize > length)
1174 break;
1175
1176 switch (opcode) {
1177 case TCPOPT_MD5SIG: /* doesn't influence state */
1178 break;
1179
1180 case TCPOPT_SACK: /* stricter checking performed later */
1181 if (opsize % 8 != 2)
1182 return false;
1183 break;
1184
1185 case TCPOPT_TIMESTAMP:
1186 /* only drop timestamps lower than new */
1187 if (opsize != TCPOLEN_TIMESTAMP)
1188 return false;
1189 tstamp = get_unaligned_be32(ptr);
1190 tsecr = get_unaligned_be32(ptr + 4);
1191 if (after(tstamp, tstamp_new) ||
1192 after(tsecr, tsecr_new))
1193 return false;
1194 break;
1195
1196 case TCPOPT_MSS: /* these should only be set on SYN */
1197 case TCPOPT_WINDOW:
1198 case TCPOPT_SACK_PERM:
1199 case TCPOPT_FASTOPEN:
1200 case TCPOPT_EXP:
1201 default: /* don't drop if any unknown options are present */
1202 return false;
1203 }
1204
1205 ptr += opsize - 2;
1206 length -= opsize;
1207 }
1208
1209 return true;
1210 }
1211
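/* TCP ACK filter: when a new packet joins a flow, scan the queue for an
 * older pure ACK of the same 5-tuple that the new packet makes redundant.
 * In conservative mode two such ACKs must be present before one (the one
 * nearest the head) is dropped; in aggressive mode a single redundant ACK
 * directly preceding the new packet, with matching ECE/CWR flags, is enough.
 */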
1212 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1213 struct cake_flow *flow)
1214 {
1215 bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1216 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1217 struct sk_buff *skb_check, *skb_prev = NULL;
1218 const struct ipv6hdr *ipv6h, *ipv6h_check;
1219 unsigned char _tcph[64], _tcph_check[64];
1220 const struct tcphdr *tcph, *tcph_check;
1221 const struct iphdr *iph, *iph_check;
1222 struct ipv6hdr _iph, _iph_check;
1223 const struct sk_buff *skb;
1224 int seglen, num_found = 0;
1225 u32 tstamp = 0, tsecr = 0;
1226 __be32 elig_flags = 0;
1227 int sack_comp;
1228
1229 /* no other possible ACKs to filter */
1230 if (flow->head == flow->tail)
1231 return NULL;
1232
1233 skb = flow->tail;
1234 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1235 iph = cake_get_iphdr(skb, &_iph);
1236 if (!tcph)
1237 return NULL;
1238
1239 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1240
1241 /* the 'triggering' packet need only have the ACK flag set.
1242 * also check that SYN is not set, as there won't be any previous ACKs.
1243 */
1244 if ((tcp_flag_word(tcph) &
1245 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1246 return NULL;
1247
1248 /* the 'triggering' ACK is at the tail of the queue, we have already
1249 * returned if it is the only packet in the flow. loop through the rest
1250 * of the queue looking for pure ACKs with the same 5-tuple as the
1251 * triggering one.
1252 */
1253 for (skb_check = flow->head;
1254 skb_check && skb_check != skb;
1255 skb_prev = skb_check, skb_check = skb_check->next) {
1256 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1257 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1258 sizeof(_tcph_check));
1259
1260 /* only TCP packets with matching 5-tuple are eligible, and only
1261 * drop safe headers
1262 */
1263 if (!tcph_check || iph->version != iph_check->version ||
1264 tcph_check->source != tcph->source ||
1265 tcph_check->dest != tcph->dest)
1266 continue;
1267
1268 if (iph_check->version == 4) {
1269 if (iph_check->saddr != iph->saddr ||
1270 iph_check->daddr != iph->daddr)
1271 continue;
1272
1273 seglen = iph_totlen(skb, iph_check) -
1274 (4 * iph_check->ihl);
1275 } else if (iph_check->version == 6) {
1276 ipv6h = (struct ipv6hdr *)iph;
1277 ipv6h_check = (struct ipv6hdr *)iph_check;
1278
1279 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1280 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1281 continue;
1282
1283 seglen = ntohs(ipv6h_check->payload_len);
1284 } else {
1285 WARN_ON(1); /* shouldn't happen */
1286 continue;
1287 }
1288
1289 /* If the ECE/CWR flags changed from the previous eligible
1290 * packet in the same flow, we should no longer be dropping that
1291 * previous packet as this would lose information.
1292 */
1293 if (elig_ack && (tcp_flag_word(tcph_check) &
1294 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1295 elig_ack = NULL;
1296 elig_ack_prev = NULL;
1297 num_found--;
1298 }
1299
1300 /* Check TCP options and flags, don't drop ACKs with segment
1301 * data, and don't drop ACKs with a higher cumulative ACK
1302 * counter than the triggering packet. Check ACK seqno here to
1303 * avoid parsing SACK options of packets we are going to exclude
1304 * anyway.
1305 */
1306 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1307 (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1308 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1309 continue;
1310
1311 /* Check SACK options. The triggering packet must SACK more data
1312 * than the ACK under consideration, or SACK the same range but
1313 * have a larger cumulative ACK counter. The latter is a
1314 * pathological case, but is contained in the following check
1315 * anyway, just to be safe.
1316 */
1317 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1318
1319 if (sack_comp < 0 ||
1320 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1321 sack_comp == 0))
1322 continue;
1323
1324 /* At this point we have found an eligible pure ACK to drop; if
1325 * we are in aggressive mode, we are done. Otherwise, keep
1326 * searching unless this is the second eligible ACK we
1327 * found.
1328 *
1329 * Since we want to drop ACK closest to the head of the queue,
1330 * save the first eligible ACK we find, even if we need to loop
1331 * again.
1332 */
1333 if (!elig_ack) {
1334 elig_ack = skb_check;
1335 elig_ack_prev = skb_prev;
1336 elig_flags = (tcp_flag_word(tcph_check)
1337 & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1338 }
1339
1340 if (num_found++ > 0)
1341 goto found;
1342 }
1343
1344 /* We made it through the queue without finding two eligible ACKs. If
1345 * we found a single eligible ACK we can drop it in aggressive mode if
1346 * we can guarantee that this does not interfere with ECN flag
1347 * information. We ensure this by dropping it only if the enqueued
1348 * packet is consecutive with the eligible ACK, and their flags match.
1349 */
1350 if (elig_ack && aggressive && elig_ack->next == skb &&
1351 (elig_flags == (tcp_flag_word(tcph) &
1352 (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1353 goto found;
1354
1355 return NULL;
1356
1357 found:
1358 if (elig_ack_prev)
1359 elig_ack_prev->next = elig_ack->next;
1360 else
1361 flow->head = elig_ack->next;
1362
1363 skb_mark_not_on_list(elig_ack);
1364
1365 return elig_ack;
1366 }
1367
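/* Shift-based EWMA: avg moves towards sample by roughly 1/2^shift per call,
 * e.g. shift == 8 weights each new sample at about 1/256.
 */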
1368 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1369 {
1370 avg -= avg >> shift;
1371 avg += sample >> shift;
1372 return avg;
1373 }
1374
1375 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1376 {
1377 if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1378 len -= off;
1379
1380 if (q->max_netlen < len)
1381 q->max_netlen = len;
1382 if (q->min_netlen > len)
1383 q->min_netlen = len;
1384
1385 len += q->rate_overhead;
1386
1387 if (len < q->rate_mpu)
1388 len = q->rate_mpu;
1389
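/* Worked example for ATM framing: a 100-byte frame needs
 * ceil(100/48) = 3 cells, accounted as 3 * 53 = 159 bytes on the wire.
 */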
1390 if (q->atm_mode == CAKE_ATM_ATM) {
1391 len += 47;
1392 len /= 48;
1393 len *= 53;
1394 } else if (q->atm_mode == CAKE_ATM_PTM) {
1395 /* Add one byte per 64 bytes or part thereof.
1396 * This is conservative and easier to calculate than the
1397 * precise value.
1398 */
1399 len += (len + 63) / 64;
1400 }
1401
1402 if (q->max_adjlen < len)
1403 q->max_adjlen = len;
1404 if (q->min_adjlen > len)
1405 q->min_adjlen = len;
1406
1407 return len;
1408 }
1409
1410 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1411 {
1412 const struct skb_shared_info *shinfo = skb_shinfo(skb);
1413 unsigned int hdr_len, last_len = 0;
1414 u32 off = skb_network_offset(skb);
1415 u32 len = qdisc_pkt_len(skb);
1416 u16 segs = 1;
1417
1418 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1419
1420 if (!shinfo->gso_size)
1421 return cake_calc_overhead(q, len, off);
1422
1423 /* borrowed from qdisc_pkt_len_init() */
1424 hdr_len = skb_transport_offset(skb);
1425
1426 /* + transport layer */
1427 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1428 SKB_GSO_TCPV6))) {
1429 const struct tcphdr *th;
1430 struct tcphdr _tcphdr;
1431
1432 th = skb_header_pointer(skb, hdr_len,
1433 sizeof(_tcphdr), &_tcphdr);
1434 if (likely(th))
1435 hdr_len += __tcp_hdrlen(th);
1436 } else {
1437 struct udphdr _udphdr;
1438
1439 if (skb_header_pointer(skb, hdr_len,
1440 sizeof(_udphdr), &_udphdr))
1441 hdr_len += sizeof(struct udphdr);
1442 }
1443
1444 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1445 segs = DIV_ROUND_UP(skb->len - hdr_len,
1446 shinfo->gso_size);
1447 else
1448 segs = shinfo->gso_segs;
1449
1450 len = shinfo->gso_size + hdr_len;
1451 last_len = skb->len - shinfo->gso_size * (segs - 1);
1452
1453 return (cake_calc_overhead(q, len, off) * (segs - 1) +
1454 cake_calc_overhead(q, last_len, off));
1455 }
1456
1457 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1458 {
1459 struct cake_heap_entry ii = q->overflow_heap[i];
1460 struct cake_heap_entry jj = q->overflow_heap[j];
1461
1462 q->overflow_heap[i] = jj;
1463 q->overflow_heap[j] = ii;
1464
1465 q->tins[ii.t].overflow_idx[ii.b] = j;
1466 q->tins[jj.t].overflow_idx[jj.b] = i;
1467 }
1468
1469 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1470 {
1471 struct cake_heap_entry ii = q->overflow_heap[i];
1472
1473 return q->tins[ii.t].backlogs[ii.b];
1474 }
1475
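/* The overflow heap is a binary max-heap over every (tin, flow) pair, keyed
 * by queue backlog; children of node m live at 2m+1 and 2m+2. cake_drop()
 * uses it to find the longest queue to prune when the buffer limit is hit.
 */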
1476 static void cake_heapify(struct cake_sched_data *q, u16 i)
1477 {
1478 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1479 u32 mb = cake_heap_get_backlog(q, i);
1480 u32 m = i;
1481
1482 while (m < a) {
1483 u32 l = m + m + 1;
1484 u32 r = l + 1;
1485
1486 if (l < a) {
1487 u32 lb = cake_heap_get_backlog(q, l);
1488
1489 if (lb > mb) {
1490 m = l;
1491 mb = lb;
1492 }
1493 }
1494
1495 if (r < a) {
1496 u32 rb = cake_heap_get_backlog(q, r);
1497
1498 if (rb > mb) {
1499 m = r;
1500 mb = rb;
1501 }
1502 }
1503
1504 if (m != i) {
1505 cake_heap_swap(q, i, m);
1506 i = m;
1507 } else {
1508 break;
1509 }
1510 }
1511 }
1512
1513 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1514 {
1515 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1516 u16 p = (i - 1) >> 1;
1517 u32 ib = cake_heap_get_backlog(q, i);
1518 u32 pb = cake_heap_get_backlog(q, p);
1519
1520 if (ib > pb) {
1521 cake_heap_swap(q, i, p);
1522 i = p;
1523 } else {
1524 break;
1525 }
1526 }
1527 }
1528
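/* Shaper arithmetic sketch: rate_ns/rate_shft are precomputed elsewhere so
 * that (len * rate_ns) >> rate_shft approximates the serialisation time of
 * len bytes in nanoseconds; e.g. a 1500-byte packet at 100Mbit/s advances
 * the schedule by roughly 120us.
 */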
1529 static int cake_advance_shaper(struct cake_sched_data *q,
1530 struct cake_tin_data *b,
1531 struct sk_buff *skb,
1532 ktime_t now, bool drop)
1533 {
1534 u32 len = get_cobalt_cb(skb)->adjusted_len;
1535
1536 /* charge packet bandwidth to this tin
1537 * and to the global shaper.
1538 */
1539 if (q->rate_ns) {
1540 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1541 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1542 u64 failsafe_dur = global_dur + (global_dur >> 1);
1543
1544 if (ktime_before(b->time_next_packet, now))
1545 b->time_next_packet = ktime_add_ns(b->time_next_packet,
1546 tin_dur);
1547
1548 else if (ktime_before(b->time_next_packet,
1549 ktime_add_ns(now, tin_dur)))
1550 b->time_next_packet = ktime_add_ns(now, tin_dur);
1551
1552 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1553 global_dur);
1554 if (!drop)
1555 q->failsafe_next_packet = \
1556 ktime_add_ns(q->failsafe_next_packet,
1557 failsafe_dur);
1558 }
1559 return len;
1560 }
1561
1562 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1563 {
1564 struct cake_sched_data *q = qdisc_priv(sch);
1565 ktime_t now = ktime_get();
1566 u32 idx = 0, tin = 0, len;
1567 struct cake_heap_entry qq;
1568 struct cake_tin_data *b;
1569 struct cake_flow *flow;
1570 struct sk_buff *skb;
1571
1572 if (!q->overflow_timeout) {
1573 int i;
1574 /* Build fresh max-heap */
1575 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1576 cake_heapify(q, i);
1577 }
1578 q->overflow_timeout = 65535;
1579
1580 /* select longest queue for pruning */
1581 qq = q->overflow_heap[0];
1582 tin = qq.t;
1583 idx = qq.b;
1584
1585 b = &q->tins[tin];
1586 flow = &b->flows[idx];
1587 skb = dequeue_head(flow);
1588 if (unlikely(!skb)) {
1589 /* heap has gone wrong, rebuild it next time */
1590 q->overflow_timeout = 0;
1591 return idx + (tin << 16);
1592 }
1593
1594 if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1595 b->unresponsive_flow_count++;
1596
1597 len = qdisc_pkt_len(skb);
1598 q->buffer_used -= skb->truesize;
1599 b->backlogs[idx] -= len;
1600 b->tin_backlog -= len;
1601 sch->qstats.backlog -= len;
1602
1603 flow->dropped++;
1604 b->tin_dropped++;
1605 sch->qstats.drops++;
1606
1607 if (q->rate_flags & CAKE_FLAG_INGRESS)
1608 cake_advance_shaper(q, b, skb, now, true);
1609
1610 __qdisc_drop(skb, to_free);
1611 sch->q.qlen--;
1612 qdisc_tree_reduce_backlog(sch, 1, len);
1613
1614 cake_heapify(q, 0);
1615
1616 return idx + (tin << 16);
1617 }
1618
1619 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1620 {
1621 const int offset = skb_network_offset(skb);
1622 u16 *buf, buf_;
1623 u8 dscp;
1624
1625 switch (skb_protocol(skb, true)) {
1626 case htons(ETH_P_IP):
1627 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1628 if (unlikely(!buf))
1629 return 0;
1630
1631 /* ToS is in the second byte of iphdr */
1632 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1633
1634 if (wash && dscp) {
1635 const int wlen = offset + sizeof(struct iphdr);
1636
1637 if (!pskb_may_pull(skb, wlen) ||
1638 skb_try_make_writable(skb, wlen))
1639 return 0;
1640
1641 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1642 }
1643
1644 return dscp;
1645
1646 case htons(ETH_P_IPV6):
1647 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1648 if (unlikely(!buf))
1649 return 0;
1650
1651 /* Traffic class is in the first and second bytes of ipv6hdr */
1652 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1653
1654 if (wash && dscp) {
1655 const int wlen = offset + sizeof(struct ipv6hdr);
1656
1657 if (!pskb_may_pull(skb, wlen) ||
1658 skb_try_make_writable(skb, wlen))
1659 return 0;
1660
1661 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1662 }
1663
1664 return dscp;
1665
1666 case htons(ETH_P_ARP):
1667 return 0x38; /* CS7 - Net Control */
1668
1669 default:
1670 /* If there is no Diffserv field, treat as best-effort */
1671 return 0;
1672 }
1673 }
1674
1675 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1676 struct sk_buff *skb)
1677 {
1678 struct cake_sched_data *q = qdisc_priv(sch);
1679 u32 tin, mark;
1680 bool wash;
1681 u8 dscp;
1682
1683 /* Tin selection: Default to diffserv-based selection, allow overriding
1684 * using firewall marks or skb->priority. Call DSCP parsing early if
1685 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1686 */
1687 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1688 wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1689 if (wash)
1690 dscp = cake_handle_diffserv(skb, wash);
1691
1692 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1693 tin = 0;
1694
1695 else if (mark && mark <= q->tin_cnt)
1696 tin = q->tin_order[mark - 1];
1697
1698 else if (TC_H_MAJ(skb->priority) == sch->handle &&
1699 TC_H_MIN(skb->priority) > 0 &&
1700 TC_H_MIN(skb->priority) <= q->tin_cnt)
1701 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1702
1703 else {
1704 if (!wash)
1705 dscp = cake_handle_diffserv(skb, wash);
1706 tin = q->tin_index[dscp];
1707
1708 if (unlikely(tin >= q->tin_cnt))
1709 tin = 0;
1710 }
1711
1712 return &q->tins[tin];
1713 }
1714
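/* An attached tc filter may override the hash: the minor part of the
 * returned classid (1..CAKE_QUEUES) selects the flow queue and the major
 * part (>> 16) selects the host index; both are handed to cake_hash() as
 * overrides.
 */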
1715 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1716 struct sk_buff *skb, int flow_mode, int *qerr)
1717 {
1718 struct cake_sched_data *q = qdisc_priv(sch);
1719 struct tcf_proto *filter;
1720 struct tcf_result res;
1721 u16 flow = 0, host = 0;
1722 int result;
1723
1724 filter = rcu_dereference_bh(q->filter_list);
1725 if (!filter)
1726 goto hash;
1727
1728 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1729 result = tcf_classify(skb, NULL, filter, &res, false);
1730
1731 if (result >= 0) {
1732 #ifdef CONFIG_NET_CLS_ACT
1733 switch (result) {
1734 case TC_ACT_STOLEN:
1735 case TC_ACT_QUEUED:
1736 case TC_ACT_TRAP:
1737 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1738 fallthrough;
1739 case TC_ACT_SHOT:
1740 return 0;
1741 }
1742 #endif
1743 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1744 flow = TC_H_MIN(res.classid);
1745 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1746 host = TC_H_MAJ(res.classid) >> 16;
1747 }
1748 hash:
1749 *t = cake_select_tin(sch, skb);
1750 return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1751 }
1752
1753 static void cake_reconfigure(struct Qdisc *sch);
1754
1755 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1756 struct sk_buff **to_free)
1757 {
1758 struct cake_sched_data *q = qdisc_priv(sch);
1759 int len = qdisc_pkt_len(skb);
1760 int ret;
1761 struct sk_buff *ack = NULL;
1762 ktime_t now = ktime_get();
1763 struct cake_tin_data *b;
1764 struct cake_flow *flow;
1765 u32 idx;
1766
1767 /* choose flow to insert into */
1768 idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1769 if (idx == 0) {
1770 if (ret & __NET_XMIT_BYPASS)
1771 qdisc_qstats_drop(sch);
1772 __qdisc_drop(skb, to_free);
1773 return ret;
1774 }
1775 idx--;
1776 flow = &b->flows[idx];
1777
1778 /* ensure shaper state isn't stale */
1779 if (!b->tin_backlog) {
1780 if (ktime_before(b->time_next_packet, now))
1781 b->time_next_packet = now;
1782
1783 if (!sch->q.qlen) {
1784 if (ktime_before(q->time_next_packet, now)) {
1785 q->failsafe_next_packet = now;
1786 q->time_next_packet = now;
1787 } else if (ktime_after(q->time_next_packet, now) &&
1788 ktime_after(q->failsafe_next_packet, now)) {
1789 u64 next = \
1790 min(ktime_to_ns(q->time_next_packet),
1791 ktime_to_ns(
1792 q->failsafe_next_packet));
1793 sch->qstats.overlimits++;
1794 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1795 }
1796 }
1797 }
1798
1799 if (unlikely(len > b->max_skblen))
1800 b->max_skblen = len;
1801
1802 if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1803 struct sk_buff *segs, *nskb;
1804 netdev_features_t features = netif_skb_features(skb);
1805 unsigned int slen = 0, numsegs = 0;
1806
1807 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1808 if (IS_ERR_OR_NULL(segs))
1809 return qdisc_drop(skb, sch, to_free);
1810
1811 skb_list_walk_safe(segs, segs, nskb) {
1812 skb_mark_not_on_list(segs);
1813 qdisc_skb_cb(segs)->pkt_len = segs->len;
1814 cobalt_set_enqueue_time(segs, now);
1815 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1816 segs);
1817 flow_queue_add(flow, segs);
1818
1819 sch->q.qlen++;
1820 numsegs++;
1821 slen += segs->len;
1822 q->buffer_used += segs->truesize;
1823 b->packets++;
1824 }
1825
1826 /* stats */
1827 b->bytes += slen;
1828 b->backlogs[idx] += slen;
1829 b->tin_backlog += slen;
1830 sch->qstats.backlog += slen;
1831 q->avg_window_bytes += slen;
1832
1833 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1834 consume_skb(skb);
1835 } else {
1836 /* not splitting */
1837 cobalt_set_enqueue_time(skb, now);
1838 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1839 flow_queue_add(flow, skb);
1840
1841 if (q->ack_filter)
1842 ack = cake_ack_filter(q, flow);
1843
1844 if (ack) {
1845 b->ack_drops++;
1846 sch->qstats.drops++;
1847 b->bytes += qdisc_pkt_len(ack);
1848 len -= qdisc_pkt_len(ack);
1849 q->buffer_used += skb->truesize - ack->truesize;
1850 if (q->rate_flags & CAKE_FLAG_INGRESS)
1851 cake_advance_shaper(q, b, ack, now, true);
1852
1853 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1854 consume_skb(ack);
1855 } else {
1856 sch->q.qlen++;
1857 q->buffer_used += skb->truesize;
1858 }
1859
1860 /* stats */
1861 b->packets++;
1862 b->bytes += len;
1863 b->backlogs[idx] += len;
1864 b->tin_backlog += len;
1865 sch->qstats.backlog += len;
1866 q->avg_window_bytes += len;
1867 }
1868
1869 if (q->overflow_timeout)
1870 cake_heapify_up(q, b->overflow_idx[idx]);
1871
1872 /* incoming bandwidth capacity estimate */
1873 if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1874 u64 packet_interval = \
1875 ktime_to_ns(ktime_sub(now, q->last_packet_time));
1876
1877 if (packet_interval > NSEC_PER_SEC)
1878 packet_interval = NSEC_PER_SEC;
1879
1880 /* filter out short-term bursts, eg. wifi aggregation */
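/* cake_ewma(avg, sample, shift), defined earlier in this file, is a
 * shift-based EWMA, approximately avg += (sample - avg) >> shift; shift 2
 * tracks a growing packet interval quickly, while shift 8 lets bursts of
 * closely-spaced packets pull the average down only slowly.
 */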
1881 q->avg_packet_interval = \
1882 cake_ewma(q->avg_packet_interval,
1883 packet_interval,
1884 (packet_interval > q->avg_packet_interval ?
1885 2 : 8));
1886
1887 q->last_packet_time = now;
1888
1889 if (packet_interval > q->avg_packet_interval) {
1890 u64 window_interval = \
1891 ktime_to_ns(ktime_sub(now,
1892 q->avg_window_begin));
1893 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1894
1895 b = div64_u64(b, window_interval);
1896 q->avg_peak_bandwidth =
1897 cake_ewma(q->avg_peak_bandwidth, b,
1898 b > q->avg_peak_bandwidth ? 2 : 8);
1899 q->avg_window_bytes = 0;
1900 q->avg_window_begin = now;
1901
1902 if (ktime_after(now,
1903 ktime_add_ms(q->last_reconfig_time,
1904 250))) {
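/* Shape to 15/16 of the measured peak so the shaper, rather than the
 * upstream link, remains the bottleneck while the estimator keeps seeing
 * fresh samples.
 */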
1905 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1906 cake_reconfigure(sch);
1907 }
1908 }
1909 } else {
1910 q->avg_window_bytes = 0;
1911 q->last_packet_time = now;
1912 }
1913
1914 /* flowchain */
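/* A flow starts as CAKE_SET_NONE, is listed as SPARSE on its first packet,
 * may pass through SPARSE_WAIT (still counted as sparse but queued in the
 * bulk rotation), becomes BULK once it exhausts its quantum with packets
 * still queued, and decays back to NONE via DECAYING after it empties; the
 * matching transitions live in cake_dequeue() below.
 */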
1915 if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1916 if (!flow->set) {
1917 list_add_tail(&flow->flowchain, &b->new_flows);
1918 } else {
1919 b->decaying_flow_count--;
1920 list_move_tail(&flow->flowchain, &b->new_flows);
1921 }
1922 flow->set = CAKE_SET_SPARSE;
1923 b->sparse_flow_count++;
1924
1925 flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1926 } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1927 /* this flow was empty, accounted as a sparse flow, but actually
1928 * in the bulk rotation.
1929 */
1930 flow->set = CAKE_SET_BULK;
1931 b->sparse_flow_count--;
1932 b->bulk_flow_count++;
1933
1934 cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1935 cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1936 }
1937
1938 if (q->buffer_used > q->buffer_max_used)
1939 q->buffer_max_used = q->buffer_used;
1940
1941 if (q->buffer_used > q->buffer_limit) {
1942 u32 dropped = 0;
1943
1944 while (q->buffer_used > q->buffer_limit) {
1945 dropped++;
1946 cake_drop(sch, to_free);
1947 }
1948 b->drop_overlimit += dropped;
1949 }
1950 return NET_XMIT_SUCCESS;
1951 }
1952
1953 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1954 {
1955 struct cake_sched_data *q = qdisc_priv(sch);
1956 struct cake_tin_data *b = &q->tins[q->cur_tin];
1957 struct cake_flow *flow = &b->flows[q->cur_flow];
1958 struct sk_buff *skb = NULL;
1959 u32 len;
1960
1961 if (flow->head) {
1962 skb = dequeue_head(flow);
1963 len = qdisc_pkt_len(skb);
1964 b->backlogs[q->cur_flow] -= len;
1965 b->tin_backlog -= len;
1966 sch->qstats.backlog -= len;
1967 q->buffer_used -= skb->truesize;
1968 sch->q.qlen--;
1969
1970 if (q->overflow_timeout)
1971 cake_heapify(q, b->overflow_idx[q->cur_flow]);
1972 }
1973 return skb;
1974 }
1975
1976 /* Discard leftover packets from a tin no longer in use. */
1977 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1978 {
1979 struct cake_sched_data *q = qdisc_priv(sch);
1980 struct sk_buff *skb;
1981
1982 q->cur_tin = tin;
1983 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1984 while (!!(skb = cake_dequeue_one(sch)))
1985 kfree_skb(skb);
1986 }
1987
1988 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1989 {
1990 struct cake_sched_data *q = qdisc_priv(sch);
1991 struct cake_tin_data *b = &q->tins[q->cur_tin];
1992 ktime_t now = ktime_get();
1993 struct cake_flow *flow;
1994 struct list_head *head;
1995 bool first_flow = true;
1996 struct sk_buff *skb;
1997 u64 delay;
1998 u32 len;
1999
2000 begin:
2001 if (!sch->q.qlen)
2002 return NULL;
2003
2004 /* global hard shaper */
2005 if (ktime_after(q->time_next_packet, now) &&
2006 ktime_after(q->failsafe_next_packet, now)) {
2007 u64 next = min(ktime_to_ns(q->time_next_packet),
2008 ktime_to_ns(q->failsafe_next_packet));
2009
2010 sch->qstats.overlimits++;
2011 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2012 return NULL;
2013 }
2014
2015 /* Choose a class to work on. */
2016 if (!q->rate_ns) {
2017 /* In unlimited mode, can't rely on shaper timings, just balance
2018 * with DRR
2019 */
2020 bool wrapped = false, empty = true;
2021
2022 while (b->tin_deficit < 0 ||
2023 !(b->sparse_flow_count + b->bulk_flow_count)) {
2024 if (b->tin_deficit <= 0)
2025 b->tin_deficit += b->tin_quantum;
2026 if (b->sparse_flow_count + b->bulk_flow_count)
2027 empty = false;
2028
2029 q->cur_tin++;
2030 b++;
2031 if (q->cur_tin >= q->tin_cnt) {
2032 q->cur_tin = 0;
2033 b = q->tins;
2034
2035 if (wrapped) {
2036 /* It's possible for q->qlen to be
2037 * nonzero when we actually have no
2038 * packets anywhere.
2039 */
2040 if (empty)
2041 return NULL;
2042 } else {
2043 wrapped = true;
2044 }
2045 }
2046 }
2047 } else {
2048 /* In shaped mode, choose:
2049 * - Highest-priority tin with queue and meeting schedule, or
2050 * - The earliest-scheduled tin with queue.
2051 */
2052 ktime_t best_time = KTIME_MAX;
2053 int tin, best_tin = 0;
2054
2055 for (tin = 0; tin < q->tin_cnt; tin++) {
2056 b = q->tins + tin;
2057 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2058 ktime_t time_to_pkt = \
2059 ktime_sub(b->time_next_packet, now);
2060
2061 if (ktime_to_ns(time_to_pkt) <= 0 ||
2062 ktime_compare(time_to_pkt,
2063 best_time) <= 0) {
2064 best_time = time_to_pkt;
2065 best_tin = tin;
2066 }
2067 }
2068 }
2069
2070 q->cur_tin = best_tin;
2071 b = q->tins + best_tin;
2072
2073 /* No point in going further if no packets to deliver. */
2074 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2075 return NULL;
2076 }
2077
2078 retry:
2079 /* service this class */
2080 head = &b->decaying_flows;
2081 if (!first_flow || list_empty(head)) {
2082 head = &b->new_flows;
2083 if (list_empty(head)) {
2084 head = &b->old_flows;
2085 if (unlikely(list_empty(head))) {
2086 head = &b->decaying_flows;
2087 if (unlikely(list_empty(head)))
2088 goto begin;
2089 }
2090 }
2091 }
2092 flow = list_first_entry(head, struct cake_flow, flowchain);
2093 q->cur_flow = flow - b->flows;
2094 first_flow = false;
2095
2096 /* flow isolation (DRR++) */
2097 if (flow->deficit <= 0) {
2098 /* Keep all flows with deficits out of the sparse and decaying
2099 * rotations. No non-empty flow can go into the decaying
2100 * rotation, so they can't get deficits
2101 */
2102 if (flow->set == CAKE_SET_SPARSE) {
2103 if (flow->head) {
2104 b->sparse_flow_count--;
2105 b->bulk_flow_count++;
2106
2107 cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2108 cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2109
2110 flow->set = CAKE_SET_BULK;
2111 } else {
2112 /* we've moved it to the bulk rotation for
2113 * correct deficit accounting but we still want
2114 * to count it as a sparse flow, not a bulk one.
2115 */
2116 flow->set = CAKE_SET_SPARSE_WAIT;
2117 }
2118 }
2119
2120 flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2121 list_move_tail(&flow->flowchain, &b->old_flows);
2122
2123 goto retry;
2124 }
2125
2126 /* Retrieve a packet via the AQM */
2127 while (1) {
2128 skb = cake_dequeue_one(sch);
2129 if (!skb) {
2130 /* this queue was actually empty */
2131 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2132 b->unresponsive_flow_count--;
2133
2134 if (flow->cvars.p_drop || flow->cvars.count ||
2135 ktime_before(now, flow->cvars.drop_next)) {
2136 /* keep in the flowchain until the state has
2137 * decayed to rest
2138 */
2139 list_move_tail(&flow->flowchain,
2140 &b->decaying_flows);
2141 if (flow->set == CAKE_SET_BULK) {
2142 b->bulk_flow_count--;
2143
2144 cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2145 cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2146
2147 b->decaying_flow_count++;
2148 } else if (flow->set == CAKE_SET_SPARSE ||
2149 flow->set == CAKE_SET_SPARSE_WAIT) {
2150 b->sparse_flow_count--;
2151 b->decaying_flow_count++;
2152 }
2153 flow->set = CAKE_SET_DECAYING;
2154 } else {
2155 /* remove empty queue from the flowchain */
2156 list_del_init(&flow->flowchain);
2157 if (flow->set == CAKE_SET_SPARSE ||
2158 flow->set == CAKE_SET_SPARSE_WAIT)
2159 b->sparse_flow_count--;
2160 else if (flow->set == CAKE_SET_BULK) {
2161 b->bulk_flow_count--;
2162
2163 cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2164 cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2165 } else
2166 b->decaying_flow_count--;
2167
2168 flow->set = CAKE_SET_NONE;
2169 }
2170 goto begin;
2171 }
2172
2173 /* Last packet in queue may be marked, shouldn't be dropped */
2174 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2175 (b->bulk_flow_count *
2176 !!(q->rate_flags &
2177 CAKE_FLAG_INGRESS))) ||
2178 !flow->head)
2179 break;
2180
2181 /* drop this packet, get another one */
2182 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2183 len = cake_advance_shaper(q, b, skb,
2184 now, true);
2185 flow->deficit -= len;
2186 b->tin_deficit -= len;
2187 }
2188 flow->dropped++;
2189 b->tin_dropped++;
2190 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2191 qdisc_qstats_drop(sch);
2192 kfree_skb(skb);
2193 if (q->rate_flags & CAKE_FLAG_INGRESS)
2194 goto retry;
2195 }
2196
2197 b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2198 qdisc_bstats_update(sch, skb);
2199
2200 /* collect delay stats */
2201 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2202 b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2203 b->peak_delay = cake_ewma(b->peak_delay, delay,
2204 delay > b->peak_delay ? 2 : 8);
2205 b->base_delay = cake_ewma(b->base_delay, delay,
2206 delay < b->base_delay ? 2 : 8);
2207
2208 len = cake_advance_shaper(q, b, skb, now, false);
2209 flow->deficit -= len;
2210 b->tin_deficit -= len;
2211
2212 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2213 u64 next = min(ktime_to_ns(q->time_next_packet),
2214 ktime_to_ns(q->failsafe_next_packet));
2215
2216 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2217 } else if (!sch->q.qlen) {
2218 int i;
2219
2220 for (i = 0; i < q->tin_cnt; i++) {
2221 if (q->tins[i].decaying_flow_count) {
2222 ktime_t next = \
2223 ktime_add_ns(now,
2224 q->tins[i].cparams.target);
2225
2226 qdisc_watchdog_schedule_ns(&q->watchdog,
2227 ktime_to_ns(next));
2228 break;
2229 }
2230 }
2231 }
2232
2233 if (q->overflow_timeout)
2234 q->overflow_timeout--;
2235
2236 return skb;
2237 }
2238
2239 static void cake_reset(struct Qdisc *sch)
2240 {
2241 struct cake_sched_data *q = qdisc_priv(sch);
2242 u32 c;
2243
2244 if (!q->tins)
2245 return;
2246
2247 for (c = 0; c < CAKE_MAX_TINS; c++)
2248 cake_clear_tin(sch, c);
2249 }
2250
2251 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2252 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
2253 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2254 [TCA_CAKE_ATM] = { .type = NLA_U32 },
2255 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
2256 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
2257 [TCA_CAKE_RTT] = { .type = NLA_U32 },
2258 [TCA_CAKE_TARGET] = { .type = NLA_U32 },
2259 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
2260 [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
2261 [TCA_CAKE_NAT] = { .type = NLA_U32 },
2262 [TCA_CAKE_RAW] = { .type = NLA_U32 },
2263 [TCA_CAKE_WASH] = { .type = NLA_U32 },
2264 [TCA_CAKE_MPU] = { .type = NLA_U32 },
2265 [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
2266 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
2267 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 },
2268 [TCA_CAKE_FWMARK] = { .type = NLA_U32 },
2269 };
2270
2271 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2272 u64 target_ns, u64 rtt_est_ns)
2273 {
2274 /* convert byte-rate into time-per-byte
2275 * so it will always unwedge in reasonable time.
2276 */
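/* Worked example (rate is consumed as a byte rate by the shaper math
 * below): for rate = 125000 bytes/s (1 Mbit/s), rate_ns and rate_shft
 * encode NSEC_PER_SEC / 125000 = 8000 ns per byte in fixed point, so a
 * 1500-byte MTU gives a byte_target_ns of roughly 12 ms; 1.5 times that
 * exceeds the default 5 ms target, so cparams.target is raised and
 * cparams.interval grows with it to stay at least twice the target.
 */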
2277 static const u64 MIN_RATE = 64;
2278 u32 byte_target = mtu;
2279 u64 byte_target_ns;
2280 u8 rate_shft = 0;
2281 u64 rate_ns = 0;
2282
2283 b->flow_quantum = 1514;
2284 if (rate) {
2285 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2286 rate_shft = 34;
2287 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2288 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2289 while (!!(rate_ns >> 34)) {
2290 rate_ns >>= 1;
2291 rate_shft--;
2292 }
2293 } /* else unlimited, ie. zero delay */
2294
2295 b->tin_rate_bps = rate;
2296 b->tin_rate_ns = rate_ns;
2297 b->tin_rate_shft = rate_shft;
2298
2299 byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2300
2301 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2302 b->cparams.interval = max(rtt_est_ns +
2303 b->cparams.target - target_ns,
2304 b->cparams.target * 2);
2305 b->cparams.mtu_time = byte_target_ns;
2306 b->cparams.p_inc = 1 << 24; /* 1/256 */
2307 b->cparams.p_dec = 1 << 20; /* 1/4096 */
2308 }
2309
2310 static int cake_config_besteffort(struct Qdisc *sch)
2311 {
2312 struct cake_sched_data *q = qdisc_priv(sch);
2313 struct cake_tin_data *b = &q->tins[0];
2314 u32 mtu = psched_mtu(qdisc_dev(sch));
2315 u64 rate = q->rate_bps;
2316
2317 q->tin_cnt = 1;
2318
2319 q->tin_index = besteffort;
2320 q->tin_order = normal_order;
2321
2322 cake_set_rate(b, rate, mtu,
2323 us_to_ns(q->target), us_to_ns(q->interval));
2324 b->tin_quantum = 65535;
2325
2326 return 0;
2327 }
2328
2329 static int cake_config_precedence(struct Qdisc *sch)
2330 {
2331 /* convert high-level (user visible) parameters into internal format */
2332 struct cake_sched_data *q = qdisc_priv(sch);
2333 u32 mtu = psched_mtu(qdisc_dev(sch));
2334 u64 rate = q->rate_bps;
2335 u32 quantum = 256;
2336 u32 i;
2337
2338 q->tin_cnt = 8;
2339 q->tin_index = precedence;
2340 q->tin_order = normal_order;
2341
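/* Each successive tin gets 7/8 of the previous tin's shaped rate and DRR
 * quantum; with quantum starting at 256 the eight tins get 256, 224, 196,
 * 171, 149, 130, 113 and 98.
 */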
2342 for (i = 0; i < q->tin_cnt; i++) {
2343 struct cake_tin_data *b = &q->tins[i];
2344
2345 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2346 us_to_ns(q->interval));
2347
2348 b->tin_quantum = max_t(u16, 1U, quantum);
2349
2350 /* calculate next class's parameters */
2351 rate *= 7;
2352 rate >>= 3;
2353
2354 quantum *= 7;
2355 quantum >>= 3;
2356 }
2357
2358 return 0;
2359 }
2360
2361 /* List of known Diffserv codepoints:
2362 *
2363 * Default Forwarding (DF/CS0) - Best Effort
2364 * Max Throughput (TOS2)
2365 * Min Delay (TOS4)
2366 * LLT "La" (TOS5)
2367 * Assured Forwarding 1 (AF1x) - x3
2368 * Assured Forwarding 2 (AF2x) - x3
2369 * Assured Forwarding 3 (AF3x) - x3
2370 * Assured Forwarding 4 (AF4x) - x3
2371 * Precedence Class 1 (CS1)
2372 * Precedence Class 2 (CS2)
2373 * Precedence Class 3 (CS3)
2374 * Precedence Class 4 (CS4)
2375 * Precedence Class 5 (CS5)
2376 * Precedence Class 6 (CS6)
2377 * Precedence Class 7 (CS7)
2378 * Voice Admit (VA)
2379 * Expedited Forwarding (EF)
2380 * Lower Effort (LE)
2381 *
2382 * Total 26 codepoints.
2383 */
2384
2385 /* List of traffic classes in RFC 4594, updated by RFC 8622:
2386 * (roughly descending order of contended priority)
2387 * (roughly ascending order of uncontended throughput)
2388 *
2389 * Network Control (CS6,CS7) - routing traffic
2390 * Telephony (EF,VA) - aka. VoIP streams
2391 * Signalling (CS5) - VoIP setup
2392 * Multimedia Conferencing (AF4x) - aka. video calls
2393 * Realtime Interactive (CS4) - eg. games
2394 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
2395 * Broadcast Video (CS3)
2396 * Low-Latency Data (AF2x,TOS4) - eg. database
2397 * Ops, Admin, Management (CS2) - eg. ssh
2398 * Standard Service (DF & unrecognised codepoints)
2399 * High-Throughput Data (AF1x,TOS2) - eg. web traffic
2400 * Low-Priority Data (LE,CS1) - eg. BitTorrent
2401 *
2402 * Total 12 traffic classes.
2403 */
2404
2405 static int cake_config_diffserv8(struct Qdisc *sch)
2406 {
2407 /* Pruned list of traffic classes for typical applications:
2408 *
2409 * Network Control (CS6, CS7)
2410 * Minimum Latency (EF, VA, CS5, CS4)
2411 * Interactive Shell (CS2)
2412 * Low Latency Transactions (AF2x, TOS4)
2413 * Video Streaming (AF4x, AF3x, CS3)
2414 * Bog Standard (DF etc.)
2415 * High Throughput (AF1x, TOS2, CS1)
2416 * Background Traffic (LE)
2417 *
2418 * Total 8 traffic classes.
2419 */
2420
2421 struct cake_sched_data *q = qdisc_priv(sch);
2422 u32 mtu = psched_mtu(qdisc_dev(sch));
2423 u64 rate = q->rate_bps;
2424 u32 quantum = 256;
2425 u32 i;
2426
2427 q->tin_cnt = 8;
2428
2429 /* codepoint to class mapping */
2430 q->tin_index = diffserv8;
2431 q->tin_order = normal_order;
2432
2433 /* class characteristics */
2434 for (i = 0; i < q->tin_cnt; i++) {
2435 struct cake_tin_data *b = &q->tins[i];
2436
2437 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2438 us_to_ns(q->interval));
2439
2440 b->tin_quantum = max_t(u16, 1U, quantum);
2441
2442 /* calculate next class's parameters */
2443 rate *= 7;
2444 rate >>= 3;
2445
2446 quantum *= 7;
2447 quantum >>= 3;
2448 }
2449
2450 return 0;
2451 }
2452
2453 static int cake_config_diffserv4(struct Qdisc *sch)
2454 {
2455 /* Further pruned list of traffic classes for four-class system:
2456 *
2457 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
2458 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2459 * Best Effort (DF, AF1x, TOS2, and those not specified)
2460 * Background Traffic (LE, CS1)
2461 *
2462 * Total 4 traffic classes.
2463 */
2464
2465 struct cake_sched_data *q = qdisc_priv(sch);
2466 u32 mtu = psched_mtu(qdisc_dev(sch));
2467 u64 rate = q->rate_bps;
2468 u32 quantum = 1024;
2469
2470 q->tin_cnt = 4;
2471
2472 /* codepoint to class mapping */
2473 q->tin_index = diffserv4;
2474 q->tin_order = bulk_order;
2475
2476 /* class characteristics */
2477 cake_set_rate(&q->tins[0], rate, mtu,
2478 us_to_ns(q->target), us_to_ns(q->interval));
2479 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2480 us_to_ns(q->target), us_to_ns(q->interval));
2481 cake_set_rate(&q->tins[2], rate >> 1, mtu,
2482 us_to_ns(q->target), us_to_ns(q->interval));
2483 cake_set_rate(&q->tins[3], rate >> 2, mtu,
2484 us_to_ns(q->target), us_to_ns(q->interval));
2485
2486 /* bandwidth-sharing weights */
2487 q->tins[0].tin_quantum = quantum;
2488 q->tins[1].tin_quantum = quantum >> 4;
2489 q->tins[2].tin_quantum = quantum >> 1;
2490 q->tins[3].tin_quantum = quantum >> 2;
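/* Net effect: the four tins get shaper thresholds of 100%, 1/16, 1/2 and
 * 1/4 of the base rate, with DRR weights in the same proportions.
 */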
2491
2492 return 0;
2493 }
2494
2495 static int cake_config_diffserv3(struct Qdisc *sch)
2496 {
2497 /* Simplified Diffserv structure with 3 tins.
2498 * Latency Sensitive (CS7, CS6, EF, VA, TOS4)
2499 * Best Effort
2500 * Low Priority (LE, CS1)
2501 */
2502 struct cake_sched_data *q = qdisc_priv(sch);
2503 u32 mtu = psched_mtu(qdisc_dev(sch));
2504 u64 rate = q->rate_bps;
2505 u32 quantum = 1024;
2506
2507 q->tin_cnt = 3;
2508
2509 /* codepoint to class mapping */
2510 q->tin_index = diffserv3;
2511 q->tin_order = bulk_order;
2512
2513 /* class characteristics */
2514 cake_set_rate(&q->tins[0], rate, mtu,
2515 us_to_ns(q->target), us_to_ns(q->interval));
2516 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2517 us_to_ns(q->target), us_to_ns(q->interval));
2518 cake_set_rate(&q->tins[2], rate >> 2, mtu,
2519 us_to_ns(q->target), us_to_ns(q->interval));
2520
2521 /* bandwidth-sharing weights */
2522 q->tins[0].tin_quantum = quantum;
2523 q->tins[1].tin_quantum = quantum >> 4;
2524 q->tins[2].tin_quantum = quantum >> 2;
2525
2526 return 0;
2527 }
2528
2529 static void cake_reconfigure(struct Qdisc *sch)
2530 {
2531 struct cake_sched_data *q = qdisc_priv(sch);
2532 int c, ft;
2533
2534 switch (q->tin_mode) {
2535 case CAKE_DIFFSERV_BESTEFFORT:
2536 ft = cake_config_besteffort(sch);
2537 break;
2538
2539 case CAKE_DIFFSERV_PRECEDENCE:
2540 ft = cake_config_precedence(sch);
2541 break;
2542
2543 case CAKE_DIFFSERV_DIFFSERV8:
2544 ft = cake_config_diffserv8(sch);
2545 break;
2546
2547 case CAKE_DIFFSERV_DIFFSERV4:
2548 ft = cake_config_diffserv4(sch);
2549 break;
2550
2551 case CAKE_DIFFSERV_DIFFSERV3:
2552 default:
2553 ft = cake_config_diffserv3(sch);
2554 break;
2555 }
2556
2557 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2558 cake_clear_tin(sch, c);
2559 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2560 }
2561
2562 q->rate_ns = q->tins[ft].tin_rate_ns;
2563 q->rate_shft = q->tins[ft].tin_rate_shft;
2564
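/* Buffer sizing: an explicit memlimit wins; otherwise allow roughly four
 * times the bandwidth-delay product implied by rate and interval, e.g.
 * 12500000 bytes/s * 100000 us / 250000 = 5 MB, but never less than 4 MB.
 * Unlimited mode gets an effectively unbounded buffer, and the min()/max()
 * below then caps the result by the qdisc packet limit times the MTU.
 */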
2565 if (q->buffer_config_limit) {
2566 q->buffer_limit = q->buffer_config_limit;
2567 } else if (q->rate_bps) {
2568 u64 t = q->rate_bps * q->interval;
2569
2570 do_div(t, USEC_PER_SEC / 4);
2571 q->buffer_limit = max_t(u32, t, 4U << 20);
2572 } else {
2573 q->buffer_limit = ~0;
2574 }
2575
2576 sch->flags &= ~TCQ_F_CAN_BYPASS;
2577
2578 q->buffer_limit = min(q->buffer_limit,
2579 max(sch->limit * psched_mtu(qdisc_dev(sch)),
2580 q->buffer_config_limit));
2581 }
2582
2583 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2584 struct netlink_ext_ack *extack)
2585 {
2586 struct cake_sched_data *q = qdisc_priv(sch);
2587 struct nlattr *tb[TCA_CAKE_MAX + 1];
2588 int err;
2589
2590 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2591 extack);
2592 if (err < 0)
2593 return err;
2594
2595 if (tb[TCA_CAKE_NAT]) {
2596 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2597 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2598 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2599 !!nla_get_u32(tb[TCA_CAKE_NAT]);
2600 #else
2601 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2602 "No conntrack support in kernel");
2603 return -EOPNOTSUPP;
2604 #endif
2605 }
2606
2607 if (tb[TCA_CAKE_BASE_RATE64])
2608 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2609
2610 if (tb[TCA_CAKE_DIFFSERV_MODE])
2611 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2612
2613 if (tb[TCA_CAKE_WASH]) {
2614 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2615 q->rate_flags |= CAKE_FLAG_WASH;
2616 else
2617 q->rate_flags &= ~CAKE_FLAG_WASH;
2618 }
2619
2620 if (tb[TCA_CAKE_FLOW_MODE])
2621 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2622 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2623 CAKE_FLOW_MASK));
2624
2625 if (tb[TCA_CAKE_ATM])
2626 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2627
2628 if (tb[TCA_CAKE_OVERHEAD]) {
2629 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2630 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2631
2632 q->max_netlen = 0;
2633 q->max_adjlen = 0;
2634 q->min_netlen = ~0;
2635 q->min_adjlen = ~0;
2636 }
2637
2638 if (tb[TCA_CAKE_RAW]) {
2639 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2640
2641 q->max_netlen = 0;
2642 q->max_adjlen = 0;
2643 q->min_netlen = ~0;
2644 q->min_adjlen = ~0;
2645 }
2646
2647 if (tb[TCA_CAKE_MPU])
2648 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2649
2650 if (tb[TCA_CAKE_RTT]) {
2651 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2652
2653 if (!q->interval)
2654 q->interval = 1;
2655 }
2656
2657 if (tb[TCA_CAKE_TARGET]) {
2658 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2659
2660 if (!q->target)
2661 q->target = 1;
2662 }
2663
2664 if (tb[TCA_CAKE_AUTORATE]) {
2665 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2666 q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2667 else
2668 q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2669 }
2670
2671 if (tb[TCA_CAKE_INGRESS]) {
2672 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2673 q->rate_flags |= CAKE_FLAG_INGRESS;
2674 else
2675 q->rate_flags &= ~CAKE_FLAG_INGRESS;
2676 }
2677
2678 if (tb[TCA_CAKE_ACK_FILTER])
2679 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2680
2681 if (tb[TCA_CAKE_MEMORY])
2682 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2683
2684 if (tb[TCA_CAKE_SPLIT_GSO]) {
2685 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2686 q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2687 else
2688 q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2689 }
2690
2691 if (tb[TCA_CAKE_FWMARK]) {
2692 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2693 q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2694 }
2695
2696 if (q->tins) {
2697 sch_tree_lock(sch);
2698 cake_reconfigure(sch);
2699 sch_tree_unlock(sch);
2700 }
2701
2702 return 0;
2703 }
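/* Illustrative userspace usage (the keywords are those implemented by
 * iproute2's tc-cake front end, which builds the attributes handled above;
 * device name and numbers are examples only):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *   tc -s qdisc show dev eth0
 *
 * Here "bandwidth" corresponds to TCA_CAKE_BASE_RATE64, "diffserv4" to
 * TCA_CAKE_DIFFSERV_MODE and "nat" to TCA_CAKE_NAT.
 */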
2704
2705 static void cake_destroy(struct Qdisc *sch)
2706 {
2707 struct cake_sched_data *q = qdisc_priv(sch);
2708
2709 qdisc_watchdog_cancel(&q->watchdog);
2710 tcf_block_put(q->block);
2711 kvfree(q->tins);
2712 }
2713
2714 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2715 struct netlink_ext_ack *extack)
2716 {
2717 struct cake_sched_data *q = qdisc_priv(sch);
2718 int i, j, err;
2719
2720 sch->limit = 10240;
2721 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2722 q->flow_mode = CAKE_FLOW_TRIPLE;
2723
2724 q->rate_bps = 0; /* unlimited by default */
2725
2726 q->interval = 100000; /* 100ms default */
2727 q->target = 5000; /* 5ms: codel RFC argues
2728 * for 5 to 10% of interval
2729 */
2730 q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2731 q->cur_tin = 0;
2732 q->cur_flow = 0;
2733
2734 qdisc_watchdog_init(&q->watchdog, sch);
2735
2736 if (opt) {
2737 err = cake_change(sch, opt, extack);
2738
2739 if (err)
2740 return err;
2741 }
2742
2743 err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2744 if (err)
2745 return err;
2746
2747 quantum_div[0] = ~0;
2748 for (i = 1; i <= CAKE_QUEUES; i++)
2749 quantum_div[i] = 65535 / i;
2750
2751 q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2752 GFP_KERNEL);
2753 if (!q->tins)
2754 return -ENOMEM;
2755
2756 for (i = 0; i < CAKE_MAX_TINS; i++) {
2757 struct cake_tin_data *b = q->tins + i;
2758
2759 INIT_LIST_HEAD(&b->new_flows);
2760 INIT_LIST_HEAD(&b->old_flows);
2761 INIT_LIST_HEAD(&b->decaying_flows);
2762 b->sparse_flow_count = 0;
2763 b->bulk_flow_count = 0;
2764 b->decaying_flow_count = 0;
2765
2766 for (j = 0; j < CAKE_QUEUES; j++) {
2767 struct cake_flow *flow = b->flows + j;
2768 u32 k = j * CAKE_MAX_TINS + i;
2769
2770 INIT_LIST_HEAD(&flow->flowchain);
2771 cobalt_vars_init(&flow->cvars);
2772
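/* Give each (tin, flow) pair its own slot in the overflow heap and record
 * it in overflow_idx[], so cake_heapify_up()/cake_heapify() in the
 * enqueue/dequeue paths can locate the entry for a given flow.
 */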
2773 q->overflow_heap[k].t = i;
2774 q->overflow_heap[k].b = j;
2775 b->overflow_idx[j] = k;
2776 }
2777 }
2778
2779 cake_reconfigure(sch);
2780 q->avg_peak_bandwidth = q->rate_bps;
2781 q->min_netlen = ~0;
2782 q->min_adjlen = ~0;
2783 return 0;
2784 }
2785
2786 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2787 {
2788 struct cake_sched_data *q = qdisc_priv(sch);
2789 struct nlattr *opts;
2790
2791 opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2792 if (!opts)
2793 goto nla_put_failure;
2794
2795 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2796 TCA_CAKE_PAD))
2797 goto nla_put_failure;
2798
2799 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2800 q->flow_mode & CAKE_FLOW_MASK))
2801 goto nla_put_failure;
2802
2803 if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2804 goto nla_put_failure;
2805
2806 if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2807 goto nla_put_failure;
2808
2809 if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2810 goto nla_put_failure;
2811
2812 if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2813 !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2814 goto nla_put_failure;
2815
2816 if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2817 !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2818 goto nla_put_failure;
2819
2820 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2821 goto nla_put_failure;
2822
2823 if (nla_put_u32(skb, TCA_CAKE_NAT,
2824 !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2825 goto nla_put_failure;
2826
2827 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2828 goto nla_put_failure;
2829
2830 if (nla_put_u32(skb, TCA_CAKE_WASH,
2831 !!(q->rate_flags & CAKE_FLAG_WASH)))
2832 goto nla_put_failure;
2833
2834 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2835 goto nla_put_failure;
2836
2837 if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2838 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2839 goto nla_put_failure;
2840
2841 if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2842 goto nla_put_failure;
2843
2844 if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2845 goto nla_put_failure;
2846
2847 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2848 !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2849 goto nla_put_failure;
2850
2851 if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2852 goto nla_put_failure;
2853
2854 return nla_nest_end(skb, opts);
2855
2856 nla_put_failure:
2857 return -1;
2858 }
2859
2860 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2861 {
2862 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2863 struct cake_sched_data *q = qdisc_priv(sch);
2864 struct nlattr *tstats, *ts;
2865 int i;
2866
2867 if (!stats)
2868 return -1;
2869
2870 #define PUT_STAT_U32(attr, data) do { \
2871 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2872 goto nla_put_failure; \
2873 } while (0)
2874 #define PUT_STAT_U64(attr, data) do { \
2875 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2876 data, TCA_CAKE_STATS_PAD)) \
2877 goto nla_put_failure; \
2878 } while (0)
2879
2880 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2881 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2882 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2883 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2884 PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2885 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2886 PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2887 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2888
2889 #undef PUT_STAT_U32
2890 #undef PUT_STAT_U64
2891
2892 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2893 if (!tstats)
2894 goto nla_put_failure;
2895
2896 #define PUT_TSTAT_U32(attr, data) do { \
2897 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2898 goto nla_put_failure; \
2899 } while (0)
2900 #define PUT_TSTAT_U64(attr, data) do { \
2901 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2902 data, TCA_CAKE_TIN_STATS_PAD)) \
2903 goto nla_put_failure; \
2904 } while (0)
2905
2906 for (i = 0; i < q->tin_cnt; i++) {
2907 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2908
2909 ts = nla_nest_start_noflag(d->skb, i + 1);
2910 if (!ts)
2911 goto nla_put_failure;
2912
2913 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2914 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2915 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2916
2917 PUT_TSTAT_U32(TARGET_US,
2918 ktime_to_us(ns_to_ktime(b->cparams.target)));
2919 PUT_TSTAT_U32(INTERVAL_US,
2920 ktime_to_us(ns_to_ktime(b->cparams.interval)));
2921
2922 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2923 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2924 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2925 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2926
2927 PUT_TSTAT_U32(PEAK_DELAY_US,
2928 ktime_to_us(ns_to_ktime(b->peak_delay)));
2929 PUT_TSTAT_U32(AVG_DELAY_US,
2930 ktime_to_us(ns_to_ktime(b->avge_delay)));
2931 PUT_TSTAT_U32(BASE_DELAY_US,
2932 ktime_to_us(ns_to_ktime(b->base_delay)));
2933
2934 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2935 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2936 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2937
2938 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2939 b->decaying_flow_count);
2940 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2941 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2942 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2943
2944 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2945 nla_nest_end(d->skb, ts);
2946 }
2947
2948 #undef PUT_TSTAT_U32
2949 #undef PUT_TSTAT_U64
2950
2951 nla_nest_end(d->skb, tstats);
2952 return nla_nest_end(d->skb, stats);
2953
2954 nla_put_failure:
2955 nla_nest_cancel(d->skb, stats);
2956 return -1;
2957 }
2958
2959 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2960 {
2961 return NULL;
2962 }
2963
2964 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2965 {
2966 return 0;
2967 }
2968
2969 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2970 u32 classid)
2971 {
2972 return 0;
2973 }
2974
2975 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2976 {
2977 }
2978
2979 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2980 struct netlink_ext_ack *extack)
2981 {
2982 struct cake_sched_data *q = qdisc_priv(sch);
2983
2984 if (cl)
2985 return NULL;
2986 return q->block;
2987 }
2988
2989 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2990 struct sk_buff *skb, struct tcmsg *tcm)
2991 {
2992 tcm->tcm_handle |= TC_H_MIN(cl);
2993 return 0;
2994 }
2995
2996 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2997 struct gnet_dump *d)
2998 {
2999 struct cake_sched_data *q = qdisc_priv(sch);
3000 const struct cake_flow *flow = NULL;
3001 struct gnet_stats_queue qs = { 0 };
3002 struct nlattr *stats;
3003 u32 idx = cl - 1;
3004
3005 if (idx < CAKE_QUEUES * q->tin_cnt) {
3006 const struct cake_tin_data *b = \
3007 &q->tins[q->tin_order[idx / CAKE_QUEUES]];
3008 const struct sk_buff *skb;
3009
3010 flow = &b->flows[idx % CAKE_QUEUES];
3011
3012 if (flow->head) {
3013 sch_tree_lock(sch);
3014 skb = flow->head;
3015 while (skb) {
3016 qs.qlen++;
3017 skb = skb->next;
3018 }
3019 sch_tree_unlock(sch);
3020 }
3021 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3022 qs.drops = flow->dropped;
3023 }
3024 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3025 return -1;
3026 if (flow) {
3027 ktime_t now = ktime_get();
3028
3029 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3030 if (!stats)
3031 return -1;
3032
3033 #define PUT_STAT_U32(attr, data) do { \
3034 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3035 goto nla_put_failure; \
3036 } while (0)
3037 #define PUT_STAT_S32(attr, data) do { \
3038 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3039 goto nla_put_failure; \
3040 } while (0)
3041
3042 PUT_STAT_S32(DEFICIT, flow->deficit);
3043 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3044 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3045 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3046 if (flow->cvars.p_drop) {
3047 PUT_STAT_S32(BLUE_TIMER_US,
3048 ktime_to_us(
3049 ktime_sub(now,
3050 flow->cvars.blue_timer)));
3051 }
3052 if (flow->cvars.dropping) {
3053 PUT_STAT_S32(DROP_NEXT_US,
3054 ktime_to_us(
3055 ktime_sub(now,
3056 flow->cvars.drop_next)));
3057 }
3058
3059 if (nla_nest_end(d->skb, stats) < 0)
3060 return -1;
3061 }
3062
3063 return 0;
3064
3065 nla_put_failure:
3066 nla_nest_cancel(d->skb, stats);
3067 return -1;
3068 }
3069
3070 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3071 {
3072 struct cake_sched_data *q = qdisc_priv(sch);
3073 unsigned int i, j;
3074
3075 if (arg->stop)
3076 return;
3077
3078 for (i = 0; i < q->tin_cnt; i++) {
3079 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3080
3081 for (j = 0; j < CAKE_QUEUES; j++) {
3082 if (list_empty(&b->flows[j].flowchain)) {
3083 arg->count++;
3084 continue;
3085 }
3086 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3087 arg))
3088 break;
3089 }
3090 }
3091 }
3092
3093 static const struct Qdisc_class_ops cake_class_ops = {
3094 .leaf = cake_leaf,
3095 .find = cake_find,
3096 .tcf_block = cake_tcf_block,
3097 .bind_tcf = cake_bind,
3098 .unbind_tcf = cake_unbind,
3099 .dump = cake_dump_class,
3100 .dump_stats = cake_dump_class_stats,
3101 .walk = cake_walk,
3102 };
3103
3104 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3105 .cl_ops = &cake_class_ops,
3106 .id = "cake",
3107 .priv_size = sizeof(struct cake_sched_data),
3108 .enqueue = cake_enqueue,
3109 .dequeue = cake_dequeue,
3110 .peek = qdisc_peek_dequeued,
3111 .init = cake_init,
3112 .reset = cake_reset,
3113 .destroy = cake_destroy,
3114 .change = cake_change,
3115 .dump = cake_dump,
3116 .dump_stats = cake_dump_stats,
3117 .owner = THIS_MODULE,
3118 };
3119
3120 static int __init cake_module_init(void)
3121 {
3122 return register_qdisc(&cake_qdisc_ops);
3123 }
3124
3125 static void __exit cake_module_exit(void)
3126 {
3127 unregister_qdisc(&cake_qdisc_ops);
3128 }
3129
3130 module_init(cake_module_init)
3131 module_exit(cake_module_exit)
3132 MODULE_AUTHOR("Jonathan Morton");
3133 MODULE_LICENSE("Dual BSD/GPL");
3134 MODULE_DESCRIPTION("The CAKE shaper.");
3135