xref: /openbmc/linux/net/sched/sch_cake.c (revision d4a9039a)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
us_to_ns(u64 us)278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
get_cobalt_cb(const struct sk_buff * skb)283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
cobalt_get_enqueue_time(const struct sk_buff * skb)289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363 
364 #define REC_INV_SQRT_CACHE (16)
365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366 
367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369  *
370  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371  */
372 
cobalt_newton_step(struct cobalt_vars * vars)373 static void cobalt_newton_step(struct cobalt_vars *vars)
374 {
375 	u32 invsqrt, invsqrt2;
376 	u64 val;
377 
378 	invsqrt = vars->rec_inv_sqrt;
379 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381 
382 	val >>= 2; /* avoid overflow in following multiply */
383 	val = (val * invsqrt) >> (32 - 2 + 1);
384 
385 	vars->rec_inv_sqrt = val;
386 }
387 
cobalt_invsqrt(struct cobalt_vars * vars)388 static void cobalt_invsqrt(struct cobalt_vars *vars)
389 {
390 	if (vars->count < REC_INV_SQRT_CACHE)
391 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392 	else
393 		cobalt_newton_step(vars);
394 }
395 
396 /* There is a big difference in timing between the accurate values placed in
397  * the cache and the approximations given by a single Newton step for small
398  * count values, particularly when stepping from count 1 to 2 or vice versa.
399  * Above 16, a single Newton step gives sufficient accuracy in either
400  * direction, given the precision stored.
401  *
402  * The magnitude of the error when stepping up to count 2 is such as to give
403  * the value that *should* have been produced at count 4.
404  */
405 
cobalt_cache_init(void)406 static void cobalt_cache_init(void)
407 {
408 	struct cobalt_vars v;
409 
410 	memset(&v, 0, sizeof(v));
411 	v.rec_inv_sqrt = ~0U;
412 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413 
414 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 		cobalt_newton_step(&v);
419 
420 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421 	}
422 }
423 
cobalt_vars_init(struct cobalt_vars * vars)424 static void cobalt_vars_init(struct cobalt_vars *vars)
425 {
426 	memset(vars, 0, sizeof(*vars));
427 
428 	if (!cobalt_rec_inv_sqrt_cache[0]) {
429 		cobalt_cache_init();
430 		cobalt_rec_inv_sqrt_cache[0] = ~0;
431 	}
432 }
433 
434 /* CoDel control_law is t + interval/sqrt(count)
435  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436  * both sqrt() and divide operation.
437  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)438 static ktime_t cobalt_control(ktime_t t,
439 			      u64 interval,
440 			      u32 rec_inv_sqrt)
441 {
442 	return ktime_add_ns(t, reciprocal_scale(interval,
443 						rec_inv_sqrt));
444 }
445 
446 /* Call this when a packet had to be dropped due to queue overflow.  Returns
447  * true if the BLUE state was quiescent before but active after this call.
448  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)449 static bool cobalt_queue_full(struct cobalt_vars *vars,
450 			      struct cobalt_params *p,
451 			      ktime_t now)
452 {
453 	bool up = false;
454 
455 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456 		up = !vars->p_drop;
457 		vars->p_drop += p->p_inc;
458 		if (vars->p_drop < p->p_inc)
459 			vars->p_drop = ~0;
460 		vars->blue_timer = now;
461 	}
462 	vars->dropping = true;
463 	vars->drop_next = now;
464 	if (!vars->count)
465 		vars->count = 1;
466 
467 	return up;
468 }
469 
470 /* Call this when the queue was serviced but turned out to be empty.  Returns
471  * true if the BLUE state was active before but quiescent after this call.
472  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)473 static bool cobalt_queue_empty(struct cobalt_vars *vars,
474 			       struct cobalt_params *p,
475 			       ktime_t now)
476 {
477 	bool down = false;
478 
479 	if (vars->p_drop &&
480 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481 		if (vars->p_drop < p->p_dec)
482 			vars->p_drop = 0;
483 		else
484 			vars->p_drop -= p->p_dec;
485 		vars->blue_timer = now;
486 		down = !vars->p_drop;
487 	}
488 	vars->dropping = false;
489 
490 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491 		vars->count--;
492 		cobalt_invsqrt(vars);
493 		vars->drop_next = cobalt_control(vars->drop_next,
494 						 p->interval,
495 						 vars->rec_inv_sqrt);
496 	}
497 
498 	return down;
499 }
500 
501 /* Call this with a freshly dequeued packet for possible congestion marking.
502  * Returns true as an instruction to drop the packet, false for delivery.
503  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)504 static bool cobalt_should_drop(struct cobalt_vars *vars,
505 			       struct cobalt_params *p,
506 			       ktime_t now,
507 			       struct sk_buff *skb,
508 			       u32 bulk_flows)
509 {
510 	bool next_due, over_target, drop = false;
511 	ktime_t schedule;
512 	u64 sojourn;
513 
514 /* The 'schedule' variable records, in its sign, whether 'now' is before or
515  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516  * scheduling decision is actually branched, without destroying that
517  * information.  Similarly, the first 'schedule' value calculated is preserved
518  * in the boolean 'next_due'.
519  *
520  * As for 'drop_next', we take advantage of the fact that 'interval' is both
521  * the delay between first exceeding 'target' and the first signalling event,
522  * *and* the scaling factor for the signalling frequency.  It's therefore very
523  * natural to use a single mechanism for both purposes, and eliminates a
524  * significant amount of reference Codel's spaghetti code.  To help with this,
525  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526  * as possible to 1.0 in fixed-point.
527  */
528 
529 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530 	schedule = ktime_sub(now, vars->drop_next);
531 	over_target = sojourn > p->target &&
532 		      sojourn > p->mtu_time * bulk_flows * 2 &&
533 		      sojourn > p->mtu_time * 4;
534 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535 
536 	vars->ecn_marked = false;
537 
538 	if (over_target) {
539 		if (!vars->dropping) {
540 			vars->dropping = true;
541 			vars->drop_next = cobalt_control(now,
542 							 p->interval,
543 							 vars->rec_inv_sqrt);
544 		}
545 		if (!vars->count)
546 			vars->count = 1;
547 	} else if (vars->dropping) {
548 		vars->dropping = false;
549 	}
550 
551 	if (next_due && vars->dropping) {
552 		/* Use ECN mark if possible, otherwise drop */
553 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554 
555 		vars->count++;
556 		if (!vars->count)
557 			vars->count--;
558 		cobalt_invsqrt(vars);
559 		vars->drop_next = cobalt_control(vars->drop_next,
560 						 p->interval,
561 						 vars->rec_inv_sqrt);
562 		schedule = ktime_sub(now, vars->drop_next);
563 	} else {
564 		while (next_due) {
565 			vars->count--;
566 			cobalt_invsqrt(vars);
567 			vars->drop_next = cobalt_control(vars->drop_next,
568 							 p->interval,
569 							 vars->rec_inv_sqrt);
570 			schedule = ktime_sub(now, vars->drop_next);
571 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 		}
573 	}
574 
575 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
576 	if (vars->p_drop)
577 		drop |= (get_random_u32() < vars->p_drop);
578 
579 	/* Overload the drop_next field as an activity timeout */
580 	if (!vars->count)
581 		vars->drop_next = ktime_add_ns(now, p->interval);
582 	else if (ktime_to_ns(schedule) > 0 && !drop)
583 		vars->drop_next = now;
584 
585 	return drop;
586 }
587 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)588 static bool cake_update_flowkeys(struct flow_keys *keys,
589 				 const struct sk_buff *skb)
590 {
591 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592 	struct nf_conntrack_tuple tuple = {};
593 	bool rev = !skb->_nfct, upd = false;
594 	__be32 ip;
595 
596 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
597 		return false;
598 
599 	if (!nf_ct_get_tuple_skb(&tuple, skb))
600 		return false;
601 
602 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603 	if (ip != keys->addrs.v4addrs.src) {
604 		keys->addrs.v4addrs.src = ip;
605 		upd = true;
606 	}
607 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608 	if (ip != keys->addrs.v4addrs.dst) {
609 		keys->addrs.v4addrs.dst = ip;
610 		upd = true;
611 	}
612 
613 	if (keys->ports.ports) {
614 		__be16 port;
615 
616 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
617 		if (port != keys->ports.src) {
618 			keys->ports.src = port;
619 			upd = true;
620 		}
621 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
622 		if (port != keys->ports.dst) {
623 			port = keys->ports.dst;
624 			upd = true;
625 		}
626 	}
627 	return upd;
628 #else
629 	return false;
630 #endif
631 }
632 
633 /* Cake has several subtle multiple bit settings. In these cases you
634  *  would be matching triple isolate mode as well.
635  */
636 
cake_dsrc(int flow_mode)637 static bool cake_dsrc(int flow_mode)
638 {
639 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640 }
641 
cake_ddst(int flow_mode)642 static bool cake_ddst(int flow_mode)
643 {
644 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645 }
646 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)647 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
648 		     int flow_mode, u16 flow_override, u16 host_override)
649 {
650 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
651 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
652 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
653 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
654 	u16 reduced_hash, srchost_idx, dsthost_idx;
655 	struct flow_keys keys, host_keys;
656 	bool use_skbhash = skb->l4_hash;
657 
658 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
659 		return 0;
660 
661 	/* If both overrides are set, or we can use the SKB hash and nat mode is
662 	 * disabled, we can skip packet dissection entirely. If nat mode is
663 	 * enabled there's another check below after doing the conntrack lookup.
664 	 */
665 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
666 		goto skip_hash;
667 
668 	skb_flow_dissect_flow_keys(skb, &keys,
669 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
670 
671 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
672 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
673 		use_skbhash = false;
674 
675 	/* If we can still use the SKB hash and don't need the host hash, we can
676 	 * skip the rest of the hashing procedure
677 	 */
678 	if (use_skbhash && !hash_hosts)
679 		goto skip_hash;
680 
681 	/* flow_hash_from_keys() sorts the addresses by value, so we have
682 	 * to preserve their order in a separate data structure to treat
683 	 * src and dst host addresses as independently selectable.
684 	 */
685 	host_keys = keys;
686 	host_keys.ports.ports     = 0;
687 	host_keys.basic.ip_proto  = 0;
688 	host_keys.keyid.keyid     = 0;
689 	host_keys.tags.flow_label = 0;
690 
691 	switch (host_keys.control.addr_type) {
692 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
693 		host_keys.addrs.v4addrs.src = 0;
694 		dsthost_hash = flow_hash_from_keys(&host_keys);
695 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
696 		host_keys.addrs.v4addrs.dst = 0;
697 		srchost_hash = flow_hash_from_keys(&host_keys);
698 		break;
699 
700 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
701 		memset(&host_keys.addrs.v6addrs.src, 0,
702 		       sizeof(host_keys.addrs.v6addrs.src));
703 		dsthost_hash = flow_hash_from_keys(&host_keys);
704 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
705 		memset(&host_keys.addrs.v6addrs.dst, 0,
706 		       sizeof(host_keys.addrs.v6addrs.dst));
707 		srchost_hash = flow_hash_from_keys(&host_keys);
708 		break;
709 
710 	default:
711 		dsthost_hash = 0;
712 		srchost_hash = 0;
713 	}
714 
715 	/* This *must* be after the above switch, since as a
716 	 * side-effect it sorts the src and dst addresses.
717 	 */
718 	if (hash_flows && !use_skbhash)
719 		flow_hash = flow_hash_from_keys(&keys);
720 
721 skip_hash:
722 	if (flow_override)
723 		flow_hash = flow_override - 1;
724 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
725 		flow_hash = skb->hash;
726 	if (host_override) {
727 		dsthost_hash = host_override - 1;
728 		srchost_hash = host_override - 1;
729 	}
730 
731 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
732 		if (flow_mode & CAKE_FLOW_SRC_IP)
733 			flow_hash ^= srchost_hash;
734 
735 		if (flow_mode & CAKE_FLOW_DST_IP)
736 			flow_hash ^= dsthost_hash;
737 	}
738 
739 	reduced_hash = flow_hash % CAKE_QUEUES;
740 
741 	/* set-associative hashing */
742 	/* fast path if no hash collision (direct lookup succeeds) */
743 	if (likely(q->tags[reduced_hash] == flow_hash &&
744 		   q->flows[reduced_hash].set)) {
745 		q->way_directs++;
746 	} else {
747 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
748 		u32 outer_hash = reduced_hash - inner_hash;
749 		bool allocate_src = false;
750 		bool allocate_dst = false;
751 		u32 i, k;
752 
753 		/* check if any active queue in the set is reserved for
754 		 * this flow.
755 		 */
756 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
757 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
758 			if (q->tags[outer_hash + k] == flow_hash) {
759 				if (i)
760 					q->way_hits++;
761 
762 				if (!q->flows[outer_hash + k].set) {
763 					/* need to increment host refcnts */
764 					allocate_src = cake_dsrc(flow_mode);
765 					allocate_dst = cake_ddst(flow_mode);
766 				}
767 
768 				goto found;
769 			}
770 		}
771 
772 		/* no queue is reserved for this flow, look for an
773 		 * empty one.
774 		 */
775 		for (i = 0; i < CAKE_SET_WAYS;
776 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
777 			if (!q->flows[outer_hash + k].set) {
778 				q->way_misses++;
779 				allocate_src = cake_dsrc(flow_mode);
780 				allocate_dst = cake_ddst(flow_mode);
781 				goto found;
782 			}
783 		}
784 
785 		/* With no empty queues, default to the original
786 		 * queue, accept the collision, update the host tags.
787 		 */
788 		q->way_collisions++;
789 		allocate_src = cake_dsrc(flow_mode);
790 		allocate_dst = cake_ddst(flow_mode);
791 
792 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
793 			if (allocate_src)
794 				q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
795 			if (allocate_dst)
796 				q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
797 		}
798 found:
799 		/* reserve queue for future packets in same flow */
800 		reduced_hash = outer_hash + k;
801 		q->tags[reduced_hash] = flow_hash;
802 
803 		if (allocate_src) {
804 			srchost_idx = srchost_hash % CAKE_QUEUES;
805 			inner_hash = srchost_idx % CAKE_SET_WAYS;
806 			outer_hash = srchost_idx - inner_hash;
807 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
808 				i++, k = (k + 1) % CAKE_SET_WAYS) {
809 				if (q->hosts[outer_hash + k].srchost_tag ==
810 				    srchost_hash)
811 					goto found_src;
812 			}
813 			for (i = 0; i < CAKE_SET_WAYS;
814 				i++, k = (k + 1) % CAKE_SET_WAYS) {
815 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
816 					break;
817 			}
818 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
819 found_src:
820 			srchost_idx = outer_hash + k;
821 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
822 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
823 			q->flows[reduced_hash].srchost = srchost_idx;
824 		}
825 
826 		if (allocate_dst) {
827 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
828 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
829 			outer_hash = dsthost_idx - inner_hash;
830 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
831 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
832 				if (q->hosts[outer_hash + k].dsthost_tag ==
833 				    dsthost_hash)
834 					goto found_dst;
835 			}
836 			for (i = 0; i < CAKE_SET_WAYS;
837 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
838 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
839 					break;
840 			}
841 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
842 found_dst:
843 			dsthost_idx = outer_hash + k;
844 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
845 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
846 			q->flows[reduced_hash].dsthost = dsthost_idx;
847 		}
848 	}
849 
850 	return reduced_hash;
851 }
852 
853 /* helper functions : might be changed when/if skb use a standard list_head */
854 /* remove one skb from head of slot queue */
855 
dequeue_head(struct cake_flow * flow)856 static struct sk_buff *dequeue_head(struct cake_flow *flow)
857 {
858 	struct sk_buff *skb = flow->head;
859 
860 	if (skb) {
861 		flow->head = skb->next;
862 		skb_mark_not_on_list(skb);
863 	}
864 
865 	return skb;
866 }
867 
868 /* add skb to flow queue (tail add) */
869 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)870 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
871 {
872 	if (!flow->head)
873 		flow->head = skb;
874 	else
875 		flow->tail->next = skb;
876 	flow->tail = skb;
877 	skb->next = NULL;
878 }
879 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)880 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
881 				    struct ipv6hdr *buf)
882 {
883 	unsigned int offset = skb_network_offset(skb);
884 	struct iphdr *iph;
885 
886 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
887 
888 	if (!iph)
889 		return NULL;
890 
891 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
892 		return skb_header_pointer(skb, offset + iph->ihl * 4,
893 					  sizeof(struct ipv6hdr), buf);
894 
895 	else if (iph->version == 4)
896 		return iph;
897 
898 	else if (iph->version == 6)
899 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
900 					  buf);
901 
902 	return NULL;
903 }
904 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)905 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
906 				      void *buf, unsigned int bufsize)
907 {
908 	unsigned int offset = skb_network_offset(skb);
909 	const struct ipv6hdr *ipv6h;
910 	const struct tcphdr *tcph;
911 	const struct iphdr *iph;
912 	struct ipv6hdr _ipv6h;
913 	struct tcphdr _tcph;
914 
915 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
916 
917 	if (!ipv6h)
918 		return NULL;
919 
920 	if (ipv6h->version == 4) {
921 		iph = (struct iphdr *)ipv6h;
922 		offset += iph->ihl * 4;
923 
924 		/* special-case 6in4 tunnelling, as that is a common way to get
925 		 * v6 connectivity in the home
926 		 */
927 		if (iph->protocol == IPPROTO_IPV6) {
928 			ipv6h = skb_header_pointer(skb, offset,
929 						   sizeof(_ipv6h), &_ipv6h);
930 
931 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
932 				return NULL;
933 
934 			offset += sizeof(struct ipv6hdr);
935 
936 		} else if (iph->protocol != IPPROTO_TCP) {
937 			return NULL;
938 		}
939 
940 	} else if (ipv6h->version == 6) {
941 		if (ipv6h->nexthdr != IPPROTO_TCP)
942 			return NULL;
943 
944 		offset += sizeof(struct ipv6hdr);
945 	} else {
946 		return NULL;
947 	}
948 
949 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
950 	if (!tcph || tcph->doff < 5)
951 		return NULL;
952 
953 	return skb_header_pointer(skb, offset,
954 				  min(__tcp_hdrlen(tcph), bufsize), buf);
955 }
956 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)957 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
958 				   int code, int *oplen)
959 {
960 	/* inspired by tcp_parse_options in tcp_input.c */
961 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
962 	const u8 *ptr = (const u8 *)(tcph + 1);
963 
964 	while (length > 0) {
965 		int opcode = *ptr++;
966 		int opsize;
967 
968 		if (opcode == TCPOPT_EOL)
969 			break;
970 		if (opcode == TCPOPT_NOP) {
971 			length--;
972 			continue;
973 		}
974 		if (length < 2)
975 			break;
976 		opsize = *ptr++;
977 		if (opsize < 2 || opsize > length)
978 			break;
979 
980 		if (opcode == code) {
981 			*oplen = opsize;
982 			return ptr;
983 		}
984 
985 		ptr += opsize - 2;
986 		length -= opsize;
987 	}
988 
989 	return NULL;
990 }
991 
992 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
993  * bytes than the other. In the case where both sequences ACKs bytes that the
994  * other doesn't, A is considered greater. DSACKs in A also makes A be
995  * considered greater.
996  *
997  * @return -1, 0 or 1 as normal compare functions
998  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)999 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1000 				  const struct tcphdr *tcph_b)
1001 {
1002 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1003 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1004 	u32 bytes_a = 0, bytes_b = 0;
1005 	int oplen_a, oplen_b;
1006 	bool first = true;
1007 
1008 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1009 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1010 
1011 	/* pointers point to option contents */
1012 	oplen_a -= TCPOLEN_SACK_BASE;
1013 	oplen_b -= TCPOLEN_SACK_BASE;
1014 
1015 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1016 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1017 		return -1;
1018 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1019 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1020 		return 1;
1021 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1022 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1023 		return 0;
1024 
1025 	while (oplen_a >= sizeof(*sack_a)) {
1026 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1027 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1028 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1029 		int oplen_tmp = oplen_b;
1030 		bool found = false;
1031 
1032 		/* DSACK; always considered greater to prevent dropping */
1033 		if (before(start_a, ack_seq_a))
1034 			return -1;
1035 
1036 		bytes_a += end_a - start_a;
1037 
1038 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1039 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1040 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1041 
1042 			/* first time through we count the total size */
1043 			if (first)
1044 				bytes_b += end_b - start_b;
1045 
1046 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1047 				found = true;
1048 				if (!first)
1049 					break;
1050 			}
1051 			oplen_tmp -= sizeof(*sack_tmp);
1052 			sack_tmp++;
1053 		}
1054 
1055 		if (!found)
1056 			return -1;
1057 
1058 		oplen_a -= sizeof(*sack_a);
1059 		sack_a++;
1060 		first = false;
1061 	}
1062 
1063 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1064 	 * either the SACKs are equal, or B SACKs more bytes.
1065 	 */
1066 	return bytes_b > bytes_a ? 1 : 0;
1067 }
1068 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1069 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1070 				 u32 *tsval, u32 *tsecr)
1071 {
1072 	const u8 *ptr;
1073 	int opsize;
1074 
1075 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1076 
1077 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1078 		*tsval = get_unaligned_be32(ptr);
1079 		*tsecr = get_unaligned_be32(ptr + 4);
1080 	}
1081 }
1082 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1083 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1084 			       u32 tstamp_new, u32 tsecr_new)
1085 {
1086 	/* inspired by tcp_parse_options in tcp_input.c */
1087 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1088 	const u8 *ptr = (const u8 *)(tcph + 1);
1089 	u32 tstamp, tsecr;
1090 
1091 	/* 3 reserved flags must be unset to avoid future breakage
1092 	 * ACK must be set
1093 	 * ECE/CWR are handled separately
1094 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1095 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1096 	 * 0x00C00000 = CWR/ECE (handled separately)
1097 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1098 	 */
1099 	if (((tcp_flag_word(tcph) &
1100 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1101 		return false;
1102 
1103 	while (length > 0) {
1104 		int opcode = *ptr++;
1105 		int opsize;
1106 
1107 		if (opcode == TCPOPT_EOL)
1108 			break;
1109 		if (opcode == TCPOPT_NOP) {
1110 			length--;
1111 			continue;
1112 		}
1113 		if (length < 2)
1114 			break;
1115 		opsize = *ptr++;
1116 		if (opsize < 2 || opsize > length)
1117 			break;
1118 
1119 		switch (opcode) {
1120 		case TCPOPT_MD5SIG: /* doesn't influence state */
1121 			break;
1122 
1123 		case TCPOPT_SACK: /* stricter checking performed later */
1124 			if (opsize % 8 != 2)
1125 				return false;
1126 			break;
1127 
1128 		case TCPOPT_TIMESTAMP:
1129 			/* only drop timestamps lower than new */
1130 			if (opsize != TCPOLEN_TIMESTAMP)
1131 				return false;
1132 			tstamp = get_unaligned_be32(ptr);
1133 			tsecr = get_unaligned_be32(ptr + 4);
1134 			if (after(tstamp, tstamp_new) ||
1135 			    after(tsecr, tsecr_new))
1136 				return false;
1137 			break;
1138 
1139 		case TCPOPT_MSS:  /* these should only be set on SYN */
1140 		case TCPOPT_WINDOW:
1141 		case TCPOPT_SACK_PERM:
1142 		case TCPOPT_FASTOPEN:
1143 		case TCPOPT_EXP:
1144 		default: /* don't drop if any unknown options are present */
1145 			return false;
1146 		}
1147 
1148 		ptr += opsize - 2;
1149 		length -= opsize;
1150 	}
1151 
1152 	return true;
1153 }
1154 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1155 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1156 				       struct cake_flow *flow)
1157 {
1158 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1159 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1160 	struct sk_buff *skb_check, *skb_prev = NULL;
1161 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1162 	unsigned char _tcph[64], _tcph_check[64];
1163 	const struct tcphdr *tcph, *tcph_check;
1164 	const struct iphdr *iph, *iph_check;
1165 	struct ipv6hdr _iph, _iph_check;
1166 	const struct sk_buff *skb;
1167 	int seglen, num_found = 0;
1168 	u32 tstamp = 0, tsecr = 0;
1169 	__be32 elig_flags = 0;
1170 	int sack_comp;
1171 
1172 	/* no other possible ACKs to filter */
1173 	if (flow->head == flow->tail)
1174 		return NULL;
1175 
1176 	skb = flow->tail;
1177 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1178 	iph = cake_get_iphdr(skb, &_iph);
1179 	if (!tcph)
1180 		return NULL;
1181 
1182 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1183 
1184 	/* the 'triggering' packet need only have the ACK flag set.
1185 	 * also check that SYN is not set, as there won't be any previous ACKs.
1186 	 */
1187 	if ((tcp_flag_word(tcph) &
1188 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1189 		return NULL;
1190 
1191 	/* the 'triggering' ACK is at the tail of the queue, we have already
1192 	 * returned if it is the only packet in the flow. loop through the rest
1193 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1194 	 * triggering one.
1195 	 */
1196 	for (skb_check = flow->head;
1197 	     skb_check && skb_check != skb;
1198 	     skb_prev = skb_check, skb_check = skb_check->next) {
1199 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1200 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1201 					     sizeof(_tcph_check));
1202 
1203 		/* only TCP packets with matching 5-tuple are eligible, and only
1204 		 * drop safe headers
1205 		 */
1206 		if (!tcph_check || iph->version != iph_check->version ||
1207 		    tcph_check->source != tcph->source ||
1208 		    tcph_check->dest != tcph->dest)
1209 			continue;
1210 
1211 		if (iph_check->version == 4) {
1212 			if (iph_check->saddr != iph->saddr ||
1213 			    iph_check->daddr != iph->daddr)
1214 				continue;
1215 
1216 			seglen = iph_totlen(skb, iph_check) -
1217 				       (4 * iph_check->ihl);
1218 		} else if (iph_check->version == 6) {
1219 			ipv6h = (struct ipv6hdr *)iph;
1220 			ipv6h_check = (struct ipv6hdr *)iph_check;
1221 
1222 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1223 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1224 				continue;
1225 
1226 			seglen = ntohs(ipv6h_check->payload_len);
1227 		} else {
1228 			WARN_ON(1);  /* shouldn't happen */
1229 			continue;
1230 		}
1231 
1232 		/* If the ECE/CWR flags changed from the previous eligible
1233 		 * packet in the same flow, we should no longer be dropping that
1234 		 * previous packet as this would lose information.
1235 		 */
1236 		if (elig_ack && (tcp_flag_word(tcph_check) &
1237 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1238 			elig_ack = NULL;
1239 			elig_ack_prev = NULL;
1240 			num_found--;
1241 		}
1242 
1243 		/* Check TCP options and flags, don't drop ACKs with segment
1244 		 * data, and don't drop ACKs with a higher cumulative ACK
1245 		 * counter than the triggering packet. Check ACK seqno here to
1246 		 * avoid parsing SACK options of packets we are going to exclude
1247 		 * anyway.
1248 		 */
1249 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1250 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1251 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1252 			continue;
1253 
1254 		/* Check SACK options. The triggering packet must SACK more data
1255 		 * than the ACK under consideration, or SACK the same range but
1256 		 * have a larger cumulative ACK counter. The latter is a
1257 		 * pathological case, but is contained in the following check
1258 		 * anyway, just to be safe.
1259 		 */
1260 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1261 
1262 		if (sack_comp < 0 ||
1263 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1264 		     sack_comp == 0))
1265 			continue;
1266 
1267 		/* At this point we have found an eligible pure ACK to drop; if
1268 		 * we are in aggressive mode, we are done. Otherwise, keep
1269 		 * searching unless this is the second eligible ACK we
1270 		 * found.
1271 		 *
1272 		 * Since we want to drop ACK closest to the head of the queue,
1273 		 * save the first eligible ACK we find, even if we need to loop
1274 		 * again.
1275 		 */
1276 		if (!elig_ack) {
1277 			elig_ack = skb_check;
1278 			elig_ack_prev = skb_prev;
1279 			elig_flags = (tcp_flag_word(tcph_check)
1280 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1281 		}
1282 
1283 		if (num_found++ > 0)
1284 			goto found;
1285 	}
1286 
1287 	/* We made it through the queue without finding two eligible ACKs . If
1288 	 * we found a single eligible ACK we can drop it in aggressive mode if
1289 	 * we can guarantee that this does not interfere with ECN flag
1290 	 * information. We ensure this by dropping it only if the enqueued
1291 	 * packet is consecutive with the eligible ACK, and their flags match.
1292 	 */
1293 	if (elig_ack && aggressive && elig_ack->next == skb &&
1294 	    (elig_flags == (tcp_flag_word(tcph) &
1295 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1296 		goto found;
1297 
1298 	return NULL;
1299 
1300 found:
1301 	if (elig_ack_prev)
1302 		elig_ack_prev->next = elig_ack->next;
1303 	else
1304 		flow->head = elig_ack->next;
1305 
1306 	skb_mark_not_on_list(elig_ack);
1307 
1308 	return elig_ack;
1309 }
1310 
cake_ewma(u64 avg,u64 sample,u32 shift)1311 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1312 {
1313 	avg -= avg >> shift;
1314 	avg += sample >> shift;
1315 	return avg;
1316 }
1317 
cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1318 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1319 {
1320 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1321 		len -= off;
1322 
1323 	if (q->max_netlen < len)
1324 		q->max_netlen = len;
1325 	if (q->min_netlen > len)
1326 		q->min_netlen = len;
1327 
1328 	len += q->rate_overhead;
1329 
1330 	if (len < q->rate_mpu)
1331 		len = q->rate_mpu;
1332 
1333 	if (q->atm_mode == CAKE_ATM_ATM) {
1334 		len += 47;
1335 		len /= 48;
1336 		len *= 53;
1337 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1338 		/* Add one byte per 64 bytes or part thereof.
1339 		 * This is conservative and easier to calculate than the
1340 		 * precise value.
1341 		 */
1342 		len += (len + 63) / 64;
1343 	}
1344 
1345 	if (q->max_adjlen < len)
1346 		q->max_adjlen = len;
1347 	if (q->min_adjlen > len)
1348 		q->min_adjlen = len;
1349 
1350 	return len;
1351 }
1352 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1353 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1354 {
1355 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1356 	unsigned int hdr_len, last_len = 0;
1357 	u32 off = skb_network_offset(skb);
1358 	u32 len = qdisc_pkt_len(skb);
1359 	u16 segs = 1;
1360 
1361 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1362 
1363 	if (!shinfo->gso_size)
1364 		return cake_calc_overhead(q, len, off);
1365 
1366 	/* borrowed from qdisc_pkt_len_init() */
1367 	hdr_len = skb_transport_offset(skb);
1368 
1369 	/* + transport layer */
1370 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1371 						SKB_GSO_TCPV6))) {
1372 		const struct tcphdr *th;
1373 		struct tcphdr _tcphdr;
1374 
1375 		th = skb_header_pointer(skb, hdr_len,
1376 					sizeof(_tcphdr), &_tcphdr);
1377 		if (likely(th))
1378 			hdr_len += __tcp_hdrlen(th);
1379 	} else {
1380 		struct udphdr _udphdr;
1381 
1382 		if (skb_header_pointer(skb, hdr_len,
1383 				       sizeof(_udphdr), &_udphdr))
1384 			hdr_len += sizeof(struct udphdr);
1385 	}
1386 
1387 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1388 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1389 				    shinfo->gso_size);
1390 	else
1391 		segs = shinfo->gso_segs;
1392 
1393 	len = shinfo->gso_size + hdr_len;
1394 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1395 
1396 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1397 		cake_calc_overhead(q, last_len, off));
1398 }
1399 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1400 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1401 {
1402 	struct cake_heap_entry ii = q->overflow_heap[i];
1403 	struct cake_heap_entry jj = q->overflow_heap[j];
1404 
1405 	q->overflow_heap[i] = jj;
1406 	q->overflow_heap[j] = ii;
1407 
1408 	q->tins[ii.t].overflow_idx[ii.b] = j;
1409 	q->tins[jj.t].overflow_idx[jj.b] = i;
1410 }
1411 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1412 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1413 {
1414 	struct cake_heap_entry ii = q->overflow_heap[i];
1415 
1416 	return q->tins[ii.t].backlogs[ii.b];
1417 }
1418 
cake_heapify(struct cake_sched_data * q,u16 i)1419 static void cake_heapify(struct cake_sched_data *q, u16 i)
1420 {
1421 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1422 	u32 mb = cake_heap_get_backlog(q, i);
1423 	u32 m = i;
1424 
1425 	while (m < a) {
1426 		u32 l = m + m + 1;
1427 		u32 r = l + 1;
1428 
1429 		if (l < a) {
1430 			u32 lb = cake_heap_get_backlog(q, l);
1431 
1432 			if (lb > mb) {
1433 				m  = l;
1434 				mb = lb;
1435 			}
1436 		}
1437 
1438 		if (r < a) {
1439 			u32 rb = cake_heap_get_backlog(q, r);
1440 
1441 			if (rb > mb) {
1442 				m  = r;
1443 				mb = rb;
1444 			}
1445 		}
1446 
1447 		if (m != i) {
1448 			cake_heap_swap(q, i, m);
1449 			i = m;
1450 		} else {
1451 			break;
1452 		}
1453 	}
1454 }
1455 
cake_heapify_up(struct cake_sched_data * q,u16 i)1456 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1457 {
1458 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1459 		u16 p = (i - 1) >> 1;
1460 		u32 ib = cake_heap_get_backlog(q, i);
1461 		u32 pb = cake_heap_get_backlog(q, p);
1462 
1463 		if (ib > pb) {
1464 			cake_heap_swap(q, i, p);
1465 			i = p;
1466 		} else {
1467 			break;
1468 		}
1469 	}
1470 }
1471 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1472 static int cake_advance_shaper(struct cake_sched_data *q,
1473 			       struct cake_tin_data *b,
1474 			       struct sk_buff *skb,
1475 			       ktime_t now, bool drop)
1476 {
1477 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1478 
1479 	/* charge packet bandwidth to this tin
1480 	 * and to the global shaper.
1481 	 */
1482 	if (q->rate_ns) {
1483 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1484 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1485 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1486 
1487 		if (ktime_before(b->time_next_packet, now))
1488 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1489 							   tin_dur);
1490 
1491 		else if (ktime_before(b->time_next_packet,
1492 				      ktime_add_ns(now, tin_dur)))
1493 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1494 
1495 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1496 						   global_dur);
1497 		if (!drop)
1498 			q->failsafe_next_packet = \
1499 				ktime_add_ns(q->failsafe_next_packet,
1500 					     failsafe_dur);
1501 	}
1502 	return len;
1503 }
1504 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1505 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1506 {
1507 	struct cake_sched_data *q = qdisc_priv(sch);
1508 	ktime_t now = ktime_get();
1509 	u32 idx = 0, tin = 0, len;
1510 	struct cake_heap_entry qq;
1511 	struct cake_tin_data *b;
1512 	struct cake_flow *flow;
1513 	struct sk_buff *skb;
1514 
1515 	if (!q->overflow_timeout) {
1516 		int i;
1517 		/* Build fresh max-heap */
1518 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1519 			cake_heapify(q, i);
1520 	}
1521 	q->overflow_timeout = 65535;
1522 
1523 	/* select longest queue for pruning */
1524 	qq  = q->overflow_heap[0];
1525 	tin = qq.t;
1526 	idx = qq.b;
1527 
1528 	b = &q->tins[tin];
1529 	flow = &b->flows[idx];
1530 	skb = dequeue_head(flow);
1531 	if (unlikely(!skb)) {
1532 		/* heap has gone wrong, rebuild it next time */
1533 		q->overflow_timeout = 0;
1534 		return idx + (tin << 16);
1535 	}
1536 
1537 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1538 		b->unresponsive_flow_count++;
1539 
1540 	len = qdisc_pkt_len(skb);
1541 	q->buffer_used      -= skb->truesize;
1542 	b->backlogs[idx]    -= len;
1543 	b->tin_backlog      -= len;
1544 	sch->qstats.backlog -= len;
1545 	qdisc_tree_reduce_backlog(sch, 1, len);
1546 
1547 	flow->dropped++;
1548 	b->tin_dropped++;
1549 	sch->qstats.drops++;
1550 
1551 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1552 		cake_advance_shaper(q, b, skb, now, true);
1553 
1554 	__qdisc_drop(skb, to_free);
1555 	sch->q.qlen--;
1556 
1557 	cake_heapify(q, 0);
1558 
1559 	return idx + (tin << 16);
1560 }
1561 
cake_handle_diffserv(struct sk_buff * skb,bool wash)1562 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1563 {
1564 	const int offset = skb_network_offset(skb);
1565 	u16 *buf, buf_;
1566 	u8 dscp;
1567 
1568 	switch (skb_protocol(skb, true)) {
1569 	case htons(ETH_P_IP):
1570 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1571 		if (unlikely(!buf))
1572 			return 0;
1573 
1574 		/* ToS is in the second byte of iphdr */
1575 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1576 
1577 		if (wash && dscp) {
1578 			const int wlen = offset + sizeof(struct iphdr);
1579 
1580 			if (!pskb_may_pull(skb, wlen) ||
1581 			    skb_try_make_writable(skb, wlen))
1582 				return 0;
1583 
1584 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1585 		}
1586 
1587 		return dscp;
1588 
1589 	case htons(ETH_P_IPV6):
1590 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1591 		if (unlikely(!buf))
1592 			return 0;
1593 
1594 		/* Traffic class is in the first and second bytes of ipv6hdr */
1595 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1596 
1597 		if (wash && dscp) {
1598 			const int wlen = offset + sizeof(struct ipv6hdr);
1599 
1600 			if (!pskb_may_pull(skb, wlen) ||
1601 			    skb_try_make_writable(skb, wlen))
1602 				return 0;
1603 
1604 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1605 		}
1606 
1607 		return dscp;
1608 
1609 	case htons(ETH_P_ARP):
1610 		return 0x38;  /* CS7 - Net Control */
1611 
1612 	default:
1613 		/* If there is no Diffserv field, treat as best-effort */
1614 		return 0;
1615 	}
1616 }
1617 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1618 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1619 					     struct sk_buff *skb)
1620 {
1621 	struct cake_sched_data *q = qdisc_priv(sch);
1622 	u32 tin, mark;
1623 	bool wash;
1624 	u8 dscp;
1625 
1626 	/* Tin selection: Default to diffserv-based selection, allow overriding
1627 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1628 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1629 	 */
1630 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1631 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1632 	if (wash)
1633 		dscp = cake_handle_diffserv(skb, wash);
1634 
1635 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1636 		tin = 0;
1637 
1638 	else if (mark && mark <= q->tin_cnt)
1639 		tin = q->tin_order[mark - 1];
1640 
1641 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1642 		 TC_H_MIN(skb->priority) > 0 &&
1643 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1644 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1645 
1646 	else {
1647 		if (!wash)
1648 			dscp = cake_handle_diffserv(skb, wash);
1649 		tin = q->tin_index[dscp];
1650 
1651 		if (unlikely(tin >= q->tin_cnt))
1652 			tin = 0;
1653 	}
1654 
1655 	return &q->tins[tin];
1656 }
1657 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1658 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1659 			 struct sk_buff *skb, int flow_mode, int *qerr)
1660 {
1661 	struct cake_sched_data *q = qdisc_priv(sch);
1662 	struct tcf_proto *filter;
1663 	struct tcf_result res;
1664 	u16 flow = 0, host = 0;
1665 	int result;
1666 
1667 	filter = rcu_dereference_bh(q->filter_list);
1668 	if (!filter)
1669 		goto hash;
1670 
1671 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1672 	result = tcf_classify(skb, NULL, filter, &res, false);
1673 
1674 	if (result >= 0) {
1675 #ifdef CONFIG_NET_CLS_ACT
1676 		switch (result) {
1677 		case TC_ACT_STOLEN:
1678 		case TC_ACT_QUEUED:
1679 		case TC_ACT_TRAP:
1680 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1681 			fallthrough;
1682 		case TC_ACT_SHOT:
1683 			return 0;
1684 		}
1685 #endif
1686 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1687 			flow = TC_H_MIN(res.classid);
1688 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1689 			host = TC_H_MAJ(res.classid) >> 16;
1690 	}
1691 hash:
1692 	*t = cake_select_tin(sch, skb);
1693 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1694 }
1695 
1696 static void cake_reconfigure(struct Qdisc *sch);
1697 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1698 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1699 			struct sk_buff **to_free)
1700 {
1701 	struct cake_sched_data *q = qdisc_priv(sch);
1702 	int len = qdisc_pkt_len(skb);
1703 	int ret;
1704 	struct sk_buff *ack = NULL;
1705 	ktime_t now = ktime_get();
1706 	struct cake_tin_data *b;
1707 	struct cake_flow *flow;
1708 	u32 idx;
1709 
1710 	/* choose flow to insert into */
1711 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1712 	if (idx == 0) {
1713 		if (ret & __NET_XMIT_BYPASS)
1714 			qdisc_qstats_drop(sch);
1715 		__qdisc_drop(skb, to_free);
1716 		return ret;
1717 	}
1718 	idx--;
1719 	flow = &b->flows[idx];
1720 
1721 	/* ensure shaper state isn't stale */
1722 	if (!b->tin_backlog) {
1723 		if (ktime_before(b->time_next_packet, now))
1724 			b->time_next_packet = now;
1725 
1726 		if (!sch->q.qlen) {
1727 			if (ktime_before(q->time_next_packet, now)) {
1728 				q->failsafe_next_packet = now;
1729 				q->time_next_packet = now;
1730 			} else if (ktime_after(q->time_next_packet, now) &&
1731 				   ktime_after(q->failsafe_next_packet, now)) {
1732 				u64 next = \
1733 					min(ktime_to_ns(q->time_next_packet),
1734 					    ktime_to_ns(
1735 						   q->failsafe_next_packet));
1736 				sch->qstats.overlimits++;
1737 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1738 			}
1739 		}
1740 	}
1741 
1742 	if (unlikely(len > b->max_skblen))
1743 		b->max_skblen = len;
1744 
1745 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1746 		struct sk_buff *segs, *nskb;
1747 		netdev_features_t features = netif_skb_features(skb);
1748 		unsigned int slen = 0, numsegs = 0;
1749 
1750 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1751 		if (IS_ERR_OR_NULL(segs))
1752 			return qdisc_drop(skb, sch, to_free);
1753 
1754 		skb_list_walk_safe(segs, segs, nskb) {
1755 			skb_mark_not_on_list(segs);
1756 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1757 			cobalt_set_enqueue_time(segs, now);
1758 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1759 									  segs);
1760 			flow_queue_add(flow, segs);
1761 
1762 			sch->q.qlen++;
1763 			numsegs++;
1764 			slen += segs->len;
1765 			q->buffer_used += segs->truesize;
1766 			b->packets++;
1767 		}
1768 
1769 		/* stats */
1770 		b->bytes	    += slen;
1771 		b->backlogs[idx]    += slen;
1772 		b->tin_backlog      += slen;
1773 		sch->qstats.backlog += slen;
1774 		q->avg_window_bytes += slen;
1775 
1776 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1777 		consume_skb(skb);
1778 	} else {
1779 		/* not splitting */
1780 		cobalt_set_enqueue_time(skb, now);
1781 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1782 		flow_queue_add(flow, skb);
1783 
1784 		if (q->ack_filter)
1785 			ack = cake_ack_filter(q, flow);
1786 
1787 		if (ack) {
1788 			b->ack_drops++;
1789 			sch->qstats.drops++;
1790 			b->bytes += qdisc_pkt_len(ack);
1791 			len -= qdisc_pkt_len(ack);
1792 			q->buffer_used += skb->truesize - ack->truesize;
1793 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1794 				cake_advance_shaper(q, b, ack, now, true);
1795 
1796 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1797 			consume_skb(ack);
1798 		} else {
1799 			sch->q.qlen++;
1800 			q->buffer_used      += skb->truesize;
1801 		}
1802 
1803 		/* stats */
1804 		b->packets++;
1805 		b->bytes	    += len;
1806 		b->backlogs[idx]    += len;
1807 		b->tin_backlog      += len;
1808 		sch->qstats.backlog += len;
1809 		q->avg_window_bytes += len;
1810 	}
1811 
1812 	if (q->overflow_timeout)
1813 		cake_heapify_up(q, b->overflow_idx[idx]);
1814 
1815 	/* incoming bandwidth capacity estimate */
1816 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1817 		u64 packet_interval = \
1818 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1819 
1820 		if (packet_interval > NSEC_PER_SEC)
1821 			packet_interval = NSEC_PER_SEC;
1822 
1823 		/* filter out short-term bursts, eg. wifi aggregation */
1824 		q->avg_packet_interval = \
1825 			cake_ewma(q->avg_packet_interval,
1826 				  packet_interval,
1827 				  (packet_interval > q->avg_packet_interval ?
1828 					  2 : 8));
1829 
1830 		q->last_packet_time = now;
1831 
1832 		if (packet_interval > q->avg_packet_interval) {
1833 			u64 window_interval = \
1834 				ktime_to_ns(ktime_sub(now,
1835 						      q->avg_window_begin));
1836 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1837 
1838 			b = div64_u64(b, window_interval);
1839 			q->avg_peak_bandwidth =
1840 				cake_ewma(q->avg_peak_bandwidth, b,
1841 					  b > q->avg_peak_bandwidth ? 2 : 8);
1842 			q->avg_window_bytes = 0;
1843 			q->avg_window_begin = now;
1844 
1845 			if (ktime_after(now,
1846 					ktime_add_ms(q->last_reconfig_time,
1847 						     250))) {
1848 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1849 				cake_reconfigure(sch);
1850 			}
1851 		}
1852 	} else {
1853 		q->avg_window_bytes = 0;
1854 		q->last_packet_time = now;
1855 	}
1856 
1857 	/* flowchain */
1858 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1859 		struct cake_host *srchost = &b->hosts[flow->srchost];
1860 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1861 		u16 host_load = 1;
1862 
1863 		if (!flow->set) {
1864 			list_add_tail(&flow->flowchain, &b->new_flows);
1865 		} else {
1866 			b->decaying_flow_count--;
1867 			list_move_tail(&flow->flowchain, &b->new_flows);
1868 		}
1869 		flow->set = CAKE_SET_SPARSE;
1870 		b->sparse_flow_count++;
1871 
1872 		if (cake_dsrc(q->flow_mode))
1873 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1874 
1875 		if (cake_ddst(q->flow_mode))
1876 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1877 
1878 		flow->deficit = (b->flow_quantum *
1879 				 quantum_div[host_load]) >> 16;
1880 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1881 		struct cake_host *srchost = &b->hosts[flow->srchost];
1882 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1883 
1884 		/* this flow was empty, accounted as a sparse flow, but actually
1885 		 * in the bulk rotation.
1886 		 */
1887 		flow->set = CAKE_SET_BULK;
1888 		b->sparse_flow_count--;
1889 		b->bulk_flow_count++;
1890 
1891 		if (cake_dsrc(q->flow_mode))
1892 			srchost->srchost_bulk_flow_count++;
1893 
1894 		if (cake_ddst(q->flow_mode))
1895 			dsthost->dsthost_bulk_flow_count++;
1896 
1897 	}
1898 
1899 	if (q->buffer_used > q->buffer_max_used)
1900 		q->buffer_max_used = q->buffer_used;
1901 
1902 	if (q->buffer_used > q->buffer_limit) {
1903 		u32 dropped = 0;
1904 
1905 		while (q->buffer_used > q->buffer_limit) {
1906 			dropped++;
1907 			cake_drop(sch, to_free);
1908 		}
1909 		b->drop_overlimit += dropped;
1910 	}
1911 	return NET_XMIT_SUCCESS;
1912 }
1913 
cake_dequeue_one(struct Qdisc * sch)1914 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1915 {
1916 	struct cake_sched_data *q = qdisc_priv(sch);
1917 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1918 	struct cake_flow *flow = &b->flows[q->cur_flow];
1919 	struct sk_buff *skb = NULL;
1920 	u32 len;
1921 
1922 	if (flow->head) {
1923 		skb = dequeue_head(flow);
1924 		len = qdisc_pkt_len(skb);
1925 		b->backlogs[q->cur_flow] -= len;
1926 		b->tin_backlog		 -= len;
1927 		sch->qstats.backlog      -= len;
1928 		q->buffer_used		 -= skb->truesize;
1929 		sch->q.qlen--;
1930 
1931 		if (q->overflow_timeout)
1932 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1933 	}
1934 	return skb;
1935 }
1936 
1937 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1938 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1939 {
1940 	struct cake_sched_data *q = qdisc_priv(sch);
1941 	struct sk_buff *skb;
1942 
1943 	q->cur_tin = tin;
1944 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1945 		while (!!(skb = cake_dequeue_one(sch)))
1946 			kfree_skb(skb);
1947 }
1948 
cake_dequeue(struct Qdisc * sch)1949 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1950 {
1951 	struct cake_sched_data *q = qdisc_priv(sch);
1952 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1953 	struct cake_host *srchost, *dsthost;
1954 	ktime_t now = ktime_get();
1955 	struct cake_flow *flow;
1956 	struct list_head *head;
1957 	bool first_flow = true;
1958 	struct sk_buff *skb;
1959 	u16 host_load;
1960 	u64 delay;
1961 	u32 len;
1962 
1963 begin:
1964 	if (!sch->q.qlen)
1965 		return NULL;
1966 
1967 	/* global hard shaper */
1968 	if (ktime_after(q->time_next_packet, now) &&
1969 	    ktime_after(q->failsafe_next_packet, now)) {
1970 		u64 next = min(ktime_to_ns(q->time_next_packet),
1971 			       ktime_to_ns(q->failsafe_next_packet));
1972 
1973 		sch->qstats.overlimits++;
1974 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1975 		return NULL;
1976 	}
1977 
1978 	/* Choose a class to work on. */
1979 	if (!q->rate_ns) {
1980 		/* In unlimited mode, can't rely on shaper timings, just balance
1981 		 * with DRR
1982 		 */
1983 		bool wrapped = false, empty = true;
1984 
1985 		while (b->tin_deficit < 0 ||
1986 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1987 			if (b->tin_deficit <= 0)
1988 				b->tin_deficit += b->tin_quantum;
1989 			if (b->sparse_flow_count + b->bulk_flow_count)
1990 				empty = false;
1991 
1992 			q->cur_tin++;
1993 			b++;
1994 			if (q->cur_tin >= q->tin_cnt) {
1995 				q->cur_tin = 0;
1996 				b = q->tins;
1997 
1998 				if (wrapped) {
1999 					/* It's possible for q->qlen to be
2000 					 * nonzero when we actually have no
2001 					 * packets anywhere.
2002 					 */
2003 					if (empty)
2004 						return NULL;
2005 				} else {
2006 					wrapped = true;
2007 				}
2008 			}
2009 		}
2010 	} else {
2011 		/* In shaped mode, choose:
2012 		 * - Highest-priority tin with queue and meeting schedule, or
2013 		 * - The earliest-scheduled tin with queue.
2014 		 */
2015 		ktime_t best_time = KTIME_MAX;
2016 		int tin, best_tin = 0;
2017 
2018 		for (tin = 0; tin < q->tin_cnt; tin++) {
2019 			b = q->tins + tin;
2020 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2021 				ktime_t time_to_pkt = \
2022 					ktime_sub(b->time_next_packet, now);
2023 
2024 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2025 				    ktime_compare(time_to_pkt,
2026 						  best_time) <= 0) {
2027 					best_time = time_to_pkt;
2028 					best_tin = tin;
2029 				}
2030 			}
2031 		}
2032 
2033 		q->cur_tin = best_tin;
2034 		b = q->tins + best_tin;
2035 
2036 		/* No point in going further if no packets to deliver. */
2037 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2038 			return NULL;
2039 	}
2040 
2041 retry:
2042 	/* service this class */
2043 	head = &b->decaying_flows;
2044 	if (!first_flow || list_empty(head)) {
2045 		head = &b->new_flows;
2046 		if (list_empty(head)) {
2047 			head = &b->old_flows;
2048 			if (unlikely(list_empty(head))) {
2049 				head = &b->decaying_flows;
2050 				if (unlikely(list_empty(head)))
2051 					goto begin;
2052 			}
2053 		}
2054 	}
2055 	flow = list_first_entry(head, struct cake_flow, flowchain);
2056 	q->cur_flow = flow - b->flows;
2057 	first_flow = false;
2058 
2059 	/* triple isolation (modified DRR++) */
2060 	srchost = &b->hosts[flow->srchost];
2061 	dsthost = &b->hosts[flow->dsthost];
2062 	host_load = 1;
2063 
2064 	/* flow isolation (DRR++) */
2065 	if (flow->deficit <= 0) {
2066 		/* Keep all flows with deficits out of the sparse and decaying
2067 		 * rotations.  No non-empty flow can go into the decaying
2068 		 * rotation, so they can't get deficits
2069 		 */
2070 		if (flow->set == CAKE_SET_SPARSE) {
2071 			if (flow->head) {
2072 				b->sparse_flow_count--;
2073 				b->bulk_flow_count++;
2074 
2075 				if (cake_dsrc(q->flow_mode))
2076 					srchost->srchost_bulk_flow_count++;
2077 
2078 				if (cake_ddst(q->flow_mode))
2079 					dsthost->dsthost_bulk_flow_count++;
2080 
2081 				flow->set = CAKE_SET_BULK;
2082 			} else {
2083 				/* we've moved it to the bulk rotation for
2084 				 * correct deficit accounting but we still want
2085 				 * to count it as a sparse flow, not a bulk one.
2086 				 */
2087 				flow->set = CAKE_SET_SPARSE_WAIT;
2088 			}
2089 		}
2090 
2091 		if (cake_dsrc(q->flow_mode))
2092 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2093 
2094 		if (cake_ddst(q->flow_mode))
2095 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2096 
2097 		WARN_ON(host_load > CAKE_QUEUES);
2098 
2099 		/* The get_random_u16() is a way to apply dithering to avoid
2100 		 * accumulating roundoff errors
2101 		 */
2102 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2103 				  get_random_u16()) >> 16;
2104 		list_move_tail(&flow->flowchain, &b->old_flows);
2105 
2106 		goto retry;
2107 	}
2108 
2109 	/* Retrieve a packet via the AQM */
2110 	while (1) {
2111 		skb = cake_dequeue_one(sch);
2112 		if (!skb) {
2113 			/* this queue was actually empty */
2114 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2115 				b->unresponsive_flow_count--;
2116 
2117 			if (flow->cvars.p_drop || flow->cvars.count ||
2118 			    ktime_before(now, flow->cvars.drop_next)) {
2119 				/* keep in the flowchain until the state has
2120 				 * decayed to rest
2121 				 */
2122 				list_move_tail(&flow->flowchain,
2123 					       &b->decaying_flows);
2124 				if (flow->set == CAKE_SET_BULK) {
2125 					b->bulk_flow_count--;
2126 
2127 					if (cake_dsrc(q->flow_mode))
2128 						srchost->srchost_bulk_flow_count--;
2129 
2130 					if (cake_ddst(q->flow_mode))
2131 						dsthost->dsthost_bulk_flow_count--;
2132 
2133 					b->decaying_flow_count++;
2134 				} else if (flow->set == CAKE_SET_SPARSE ||
2135 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2136 					b->sparse_flow_count--;
2137 					b->decaying_flow_count++;
2138 				}
2139 				flow->set = CAKE_SET_DECAYING;
2140 			} else {
2141 				/* remove empty queue from the flowchain */
2142 				list_del_init(&flow->flowchain);
2143 				if (flow->set == CAKE_SET_SPARSE ||
2144 				    flow->set == CAKE_SET_SPARSE_WAIT)
2145 					b->sparse_flow_count--;
2146 				else if (flow->set == CAKE_SET_BULK) {
2147 					b->bulk_flow_count--;
2148 
2149 					if (cake_dsrc(q->flow_mode))
2150 						srchost->srchost_bulk_flow_count--;
2151 
2152 					if (cake_ddst(q->flow_mode))
2153 						dsthost->dsthost_bulk_flow_count--;
2154 
2155 				} else
2156 					b->decaying_flow_count--;
2157 
2158 				flow->set = CAKE_SET_NONE;
2159 			}
2160 			goto begin;
2161 		}
2162 
2163 		/* Last packet in queue may be marked, shouldn't be dropped */
2164 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2165 					(b->bulk_flow_count *
2166 					 !!(q->rate_flags &
2167 					    CAKE_FLAG_INGRESS))) ||
2168 		    !flow->head)
2169 			break;
2170 
2171 		/* drop this packet, get another one */
2172 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2173 			len = cake_advance_shaper(q, b, skb,
2174 						  now, true);
2175 			flow->deficit -= len;
2176 			b->tin_deficit -= len;
2177 		}
2178 		flow->dropped++;
2179 		b->tin_dropped++;
2180 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2181 		qdisc_qstats_drop(sch);
2182 		kfree_skb(skb);
2183 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2184 			goto retry;
2185 	}
2186 
2187 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2188 	qdisc_bstats_update(sch, skb);
2189 
2190 	/* collect delay stats */
2191 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2192 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2193 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2194 				  delay > b->peak_delay ? 2 : 8);
2195 	b->base_delay = cake_ewma(b->base_delay, delay,
2196 				  delay < b->base_delay ? 2 : 8);
2197 
2198 	len = cake_advance_shaper(q, b, skb, now, false);
2199 	flow->deficit -= len;
2200 	b->tin_deficit -= len;
2201 
2202 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2203 		u64 next = min(ktime_to_ns(q->time_next_packet),
2204 			       ktime_to_ns(q->failsafe_next_packet));
2205 
2206 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2207 	} else if (!sch->q.qlen) {
2208 		int i;
2209 
2210 		for (i = 0; i < q->tin_cnt; i++) {
2211 			if (q->tins[i].decaying_flow_count) {
2212 				ktime_t next = \
2213 					ktime_add_ns(now,
2214 						     q->tins[i].cparams.target);
2215 
2216 				qdisc_watchdog_schedule_ns(&q->watchdog,
2217 							   ktime_to_ns(next));
2218 				break;
2219 			}
2220 		}
2221 	}
2222 
2223 	if (q->overflow_timeout)
2224 		q->overflow_timeout--;
2225 
2226 	return skb;
2227 }
2228 
cake_reset(struct Qdisc * sch)2229 static void cake_reset(struct Qdisc *sch)
2230 {
2231 	struct cake_sched_data *q = qdisc_priv(sch);
2232 	u32 c;
2233 
2234 	if (!q->tins)
2235 		return;
2236 
2237 	for (c = 0; c < CAKE_MAX_TINS; c++)
2238 		cake_clear_tin(sch, c);
2239 }
2240 
2241 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2242 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2243 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2244 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2245 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2246 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2247 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2248 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2249 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2250 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2251 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2252 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2253 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2254 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2255 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2256 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2257 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2258 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2259 };
2260 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2261 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2262 			  u64 target_ns, u64 rtt_est_ns)
2263 {
2264 	/* convert byte-rate into time-per-byte
2265 	 * so it will always unwedge in reasonable time.
2266 	 */
2267 	static const u64 MIN_RATE = 64;
2268 	u32 byte_target = mtu;
2269 	u64 byte_target_ns;
2270 	u8  rate_shft = 0;
2271 	u64 rate_ns = 0;
2272 
2273 	b->flow_quantum = 1514;
2274 	if (rate) {
2275 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2276 		rate_shft = 34;
2277 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2278 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2279 		while (!!(rate_ns >> 34)) {
2280 			rate_ns >>= 1;
2281 			rate_shft--;
2282 		}
2283 	} /* else unlimited, ie. zero delay */
2284 
2285 	b->tin_rate_bps  = rate;
2286 	b->tin_rate_ns   = rate_ns;
2287 	b->tin_rate_shft = rate_shft;
2288 
2289 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2290 
2291 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2292 	b->cparams.interval = max(rtt_est_ns +
2293 				     b->cparams.target - target_ns,
2294 				     b->cparams.target * 2);
2295 	b->cparams.mtu_time = byte_target_ns;
2296 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2297 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2298 }
2299 
cake_config_besteffort(struct Qdisc * sch)2300 static int cake_config_besteffort(struct Qdisc *sch)
2301 {
2302 	struct cake_sched_data *q = qdisc_priv(sch);
2303 	struct cake_tin_data *b = &q->tins[0];
2304 	u32 mtu = psched_mtu(qdisc_dev(sch));
2305 	u64 rate = q->rate_bps;
2306 
2307 	q->tin_cnt = 1;
2308 
2309 	q->tin_index = besteffort;
2310 	q->tin_order = normal_order;
2311 
2312 	cake_set_rate(b, rate, mtu,
2313 		      us_to_ns(q->target), us_to_ns(q->interval));
2314 	b->tin_quantum = 65535;
2315 
2316 	return 0;
2317 }
2318 
cake_config_precedence(struct Qdisc * sch)2319 static int cake_config_precedence(struct Qdisc *sch)
2320 {
2321 	/* convert high-level (user visible) parameters into internal format */
2322 	struct cake_sched_data *q = qdisc_priv(sch);
2323 	u32 mtu = psched_mtu(qdisc_dev(sch));
2324 	u64 rate = q->rate_bps;
2325 	u32 quantum = 256;
2326 	u32 i;
2327 
2328 	q->tin_cnt = 8;
2329 	q->tin_index = precedence;
2330 	q->tin_order = normal_order;
2331 
2332 	for (i = 0; i < q->tin_cnt; i++) {
2333 		struct cake_tin_data *b = &q->tins[i];
2334 
2335 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2336 			      us_to_ns(q->interval));
2337 
2338 		b->tin_quantum = max_t(u16, 1U, quantum);
2339 
2340 		/* calculate next class's parameters */
2341 		rate  *= 7;
2342 		rate >>= 3;
2343 
2344 		quantum  *= 7;
2345 		quantum >>= 3;
2346 	}
2347 
2348 	return 0;
2349 }
2350 
2351 /*	List of known Diffserv codepoints:
2352  *
2353  *	Default Forwarding (DF/CS0) - Best Effort
2354  *	Max Throughput (TOS2)
2355  *	Min Delay (TOS4)
2356  *	LLT "La" (TOS5)
2357  *	Assured Forwarding 1 (AF1x) - x3
2358  *	Assured Forwarding 2 (AF2x) - x3
2359  *	Assured Forwarding 3 (AF3x) - x3
2360  *	Assured Forwarding 4 (AF4x) - x3
2361  *	Precedence Class 1 (CS1)
2362  *	Precedence Class 2 (CS2)
2363  *	Precedence Class 3 (CS3)
2364  *	Precedence Class 4 (CS4)
2365  *	Precedence Class 5 (CS5)
2366  *	Precedence Class 6 (CS6)
2367  *	Precedence Class 7 (CS7)
2368  *	Voice Admit (VA)
2369  *	Expedited Forwarding (EF)
2370  *	Lower Effort (LE)
2371  *
2372  *	Total 26 codepoints.
2373  */
2374 
2375 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2376  *		(roughly descending order of contended priority)
2377  *		(roughly ascending order of uncontended throughput)
2378  *
2379  *	Network Control (CS6,CS7)      - routing traffic
2380  *	Telephony (EF,VA)         - aka. VoIP streams
2381  *	Signalling (CS5)               - VoIP setup
2382  *	Multimedia Conferencing (AF4x) - aka. video calls
2383  *	Realtime Interactive (CS4)     - eg. games
2384  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2385  *	Broadcast Video (CS3)
2386  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2387  *	Ops, Admin, Management (CS2)      - eg. ssh
2388  *	Standard Service (DF & unrecognised codepoints)
2389  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2390  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2391  *
2392  *	Total 12 traffic classes.
2393  */
2394 
cake_config_diffserv8(struct Qdisc * sch)2395 static int cake_config_diffserv8(struct Qdisc *sch)
2396 {
2397 /*	Pruned list of traffic classes for typical applications:
2398  *
2399  *		Network Control          (CS6, CS7)
2400  *		Minimum Latency          (EF, VA, CS5, CS4)
2401  *		Interactive Shell        (CS2)
2402  *		Low Latency Transactions (AF2x, TOS4)
2403  *		Video Streaming          (AF4x, AF3x, CS3)
2404  *		Bog Standard             (DF etc.)
2405  *		High Throughput          (AF1x, TOS2, CS1)
2406  *		Background Traffic       (LE)
2407  *
2408  *		Total 8 traffic classes.
2409  */
2410 
2411 	struct cake_sched_data *q = qdisc_priv(sch);
2412 	u32 mtu = psched_mtu(qdisc_dev(sch));
2413 	u64 rate = q->rate_bps;
2414 	u32 quantum = 256;
2415 	u32 i;
2416 
2417 	q->tin_cnt = 8;
2418 
2419 	/* codepoint to class mapping */
2420 	q->tin_index = diffserv8;
2421 	q->tin_order = normal_order;
2422 
2423 	/* class characteristics */
2424 	for (i = 0; i < q->tin_cnt; i++) {
2425 		struct cake_tin_data *b = &q->tins[i];
2426 
2427 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2428 			      us_to_ns(q->interval));
2429 
2430 		b->tin_quantum = max_t(u16, 1U, quantum);
2431 
2432 		/* calculate next class's parameters */
2433 		rate  *= 7;
2434 		rate >>= 3;
2435 
2436 		quantum  *= 7;
2437 		quantum >>= 3;
2438 	}
2439 
2440 	return 0;
2441 }
2442 
cake_config_diffserv4(struct Qdisc * sch)2443 static int cake_config_diffserv4(struct Qdisc *sch)
2444 {
2445 /*  Further pruned list of traffic classes for four-class system:
2446  *
2447  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2448  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2449  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2450  *	    Background Traffic (LE, CS1)
2451  *
2452  *		Total 4 traffic classes.
2453  */
2454 
2455 	struct cake_sched_data *q = qdisc_priv(sch);
2456 	u32 mtu = psched_mtu(qdisc_dev(sch));
2457 	u64 rate = q->rate_bps;
2458 	u32 quantum = 1024;
2459 
2460 	q->tin_cnt = 4;
2461 
2462 	/* codepoint to class mapping */
2463 	q->tin_index = diffserv4;
2464 	q->tin_order = bulk_order;
2465 
2466 	/* class characteristics */
2467 	cake_set_rate(&q->tins[0], rate, mtu,
2468 		      us_to_ns(q->target), us_to_ns(q->interval));
2469 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2470 		      us_to_ns(q->target), us_to_ns(q->interval));
2471 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2472 		      us_to_ns(q->target), us_to_ns(q->interval));
2473 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2474 		      us_to_ns(q->target), us_to_ns(q->interval));
2475 
2476 	/* bandwidth-sharing weights */
2477 	q->tins[0].tin_quantum = quantum;
2478 	q->tins[1].tin_quantum = quantum >> 4;
2479 	q->tins[2].tin_quantum = quantum >> 1;
2480 	q->tins[3].tin_quantum = quantum >> 2;
2481 
2482 	return 0;
2483 }
2484 
cake_config_diffserv3(struct Qdisc * sch)2485 static int cake_config_diffserv3(struct Qdisc *sch)
2486 {
2487 /*  Simplified Diffserv structure with 3 tins.
2488  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2489  *		Best Effort
2490  *		Low Priority		(LE, CS1)
2491  */
2492 	struct cake_sched_data *q = qdisc_priv(sch);
2493 	u32 mtu = psched_mtu(qdisc_dev(sch));
2494 	u64 rate = q->rate_bps;
2495 	u32 quantum = 1024;
2496 
2497 	q->tin_cnt = 3;
2498 
2499 	/* codepoint to class mapping */
2500 	q->tin_index = diffserv3;
2501 	q->tin_order = bulk_order;
2502 
2503 	/* class characteristics */
2504 	cake_set_rate(&q->tins[0], rate, mtu,
2505 		      us_to_ns(q->target), us_to_ns(q->interval));
2506 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2507 		      us_to_ns(q->target), us_to_ns(q->interval));
2508 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2509 		      us_to_ns(q->target), us_to_ns(q->interval));
2510 
2511 	/* bandwidth-sharing weights */
2512 	q->tins[0].tin_quantum = quantum;
2513 	q->tins[1].tin_quantum = quantum >> 4;
2514 	q->tins[2].tin_quantum = quantum >> 2;
2515 
2516 	return 0;
2517 }
2518 
cake_reconfigure(struct Qdisc * sch)2519 static void cake_reconfigure(struct Qdisc *sch)
2520 {
2521 	struct cake_sched_data *q = qdisc_priv(sch);
2522 	int c, ft;
2523 
2524 	switch (q->tin_mode) {
2525 	case CAKE_DIFFSERV_BESTEFFORT:
2526 		ft = cake_config_besteffort(sch);
2527 		break;
2528 
2529 	case CAKE_DIFFSERV_PRECEDENCE:
2530 		ft = cake_config_precedence(sch);
2531 		break;
2532 
2533 	case CAKE_DIFFSERV_DIFFSERV8:
2534 		ft = cake_config_diffserv8(sch);
2535 		break;
2536 
2537 	case CAKE_DIFFSERV_DIFFSERV4:
2538 		ft = cake_config_diffserv4(sch);
2539 		break;
2540 
2541 	case CAKE_DIFFSERV_DIFFSERV3:
2542 	default:
2543 		ft = cake_config_diffserv3(sch);
2544 		break;
2545 	}
2546 
2547 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2548 		cake_clear_tin(sch, c);
2549 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2550 	}
2551 
2552 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2553 	q->rate_shft = q->tins[ft].tin_rate_shft;
2554 
2555 	if (q->buffer_config_limit) {
2556 		q->buffer_limit = q->buffer_config_limit;
2557 	} else if (q->rate_bps) {
2558 		u64 t = q->rate_bps * q->interval;
2559 
2560 		do_div(t, USEC_PER_SEC / 4);
2561 		q->buffer_limit = max_t(u32, t, 4U << 20);
2562 	} else {
2563 		q->buffer_limit = ~0;
2564 	}
2565 
2566 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2567 
2568 	q->buffer_limit = min(q->buffer_limit,
2569 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2570 				  q->buffer_config_limit));
2571 }
2572 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2573 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2574 		       struct netlink_ext_ack *extack)
2575 {
2576 	struct cake_sched_data *q = qdisc_priv(sch);
2577 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2578 	int err;
2579 
2580 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2581 					  extack);
2582 	if (err < 0)
2583 		return err;
2584 
2585 	if (tb[TCA_CAKE_NAT]) {
2586 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2587 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2588 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2589 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2590 #else
2591 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2592 				    "No conntrack support in kernel");
2593 		return -EOPNOTSUPP;
2594 #endif
2595 	}
2596 
2597 	if (tb[TCA_CAKE_BASE_RATE64])
2598 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2599 
2600 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2601 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2602 
2603 	if (tb[TCA_CAKE_WASH]) {
2604 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2605 			q->rate_flags |= CAKE_FLAG_WASH;
2606 		else
2607 			q->rate_flags &= ~CAKE_FLAG_WASH;
2608 	}
2609 
2610 	if (tb[TCA_CAKE_FLOW_MODE])
2611 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2612 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2613 					CAKE_FLOW_MASK));
2614 
2615 	if (tb[TCA_CAKE_ATM])
2616 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2617 
2618 	if (tb[TCA_CAKE_OVERHEAD]) {
2619 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2620 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2621 
2622 		q->max_netlen = 0;
2623 		q->max_adjlen = 0;
2624 		q->min_netlen = ~0;
2625 		q->min_adjlen = ~0;
2626 	}
2627 
2628 	if (tb[TCA_CAKE_RAW]) {
2629 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2630 
2631 		q->max_netlen = 0;
2632 		q->max_adjlen = 0;
2633 		q->min_netlen = ~0;
2634 		q->min_adjlen = ~0;
2635 	}
2636 
2637 	if (tb[TCA_CAKE_MPU])
2638 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2639 
2640 	if (tb[TCA_CAKE_RTT]) {
2641 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2642 
2643 		if (!q->interval)
2644 			q->interval = 1;
2645 	}
2646 
2647 	if (tb[TCA_CAKE_TARGET]) {
2648 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2649 
2650 		if (!q->target)
2651 			q->target = 1;
2652 	}
2653 
2654 	if (tb[TCA_CAKE_AUTORATE]) {
2655 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2656 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2657 		else
2658 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2659 	}
2660 
2661 	if (tb[TCA_CAKE_INGRESS]) {
2662 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2663 			q->rate_flags |= CAKE_FLAG_INGRESS;
2664 		else
2665 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2666 	}
2667 
2668 	if (tb[TCA_CAKE_ACK_FILTER])
2669 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2670 
2671 	if (tb[TCA_CAKE_MEMORY])
2672 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2673 
2674 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2675 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2676 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2677 		else
2678 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2679 	}
2680 
2681 	if (tb[TCA_CAKE_FWMARK]) {
2682 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2683 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2684 	}
2685 
2686 	if (q->tins) {
2687 		sch_tree_lock(sch);
2688 		cake_reconfigure(sch);
2689 		sch_tree_unlock(sch);
2690 	}
2691 
2692 	return 0;
2693 }
2694 
cake_destroy(struct Qdisc * sch)2695 static void cake_destroy(struct Qdisc *sch)
2696 {
2697 	struct cake_sched_data *q = qdisc_priv(sch);
2698 
2699 	qdisc_watchdog_cancel(&q->watchdog);
2700 	tcf_block_put(q->block);
2701 	kvfree(q->tins);
2702 }
2703 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2704 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2705 		     struct netlink_ext_ack *extack)
2706 {
2707 	struct cake_sched_data *q = qdisc_priv(sch);
2708 	int i, j, err;
2709 
2710 	sch->limit = 10240;
2711 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2712 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2713 
2714 	q->rate_bps = 0; /* unlimited by default */
2715 
2716 	q->interval = 100000; /* 100ms default */
2717 	q->target   =   5000; /* 5ms: codel RFC argues
2718 			       * for 5 to 10% of interval
2719 			       */
2720 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2721 	q->cur_tin = 0;
2722 	q->cur_flow  = 0;
2723 
2724 	qdisc_watchdog_init(&q->watchdog, sch);
2725 
2726 	if (opt) {
2727 		err = cake_change(sch, opt, extack);
2728 
2729 		if (err)
2730 			return err;
2731 	}
2732 
2733 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2734 	if (err)
2735 		return err;
2736 
2737 	quantum_div[0] = ~0;
2738 	for (i = 1; i <= CAKE_QUEUES; i++)
2739 		quantum_div[i] = 65535 / i;
2740 
2741 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2742 			   GFP_KERNEL);
2743 	if (!q->tins)
2744 		return -ENOMEM;
2745 
2746 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2747 		struct cake_tin_data *b = q->tins + i;
2748 
2749 		INIT_LIST_HEAD(&b->new_flows);
2750 		INIT_LIST_HEAD(&b->old_flows);
2751 		INIT_LIST_HEAD(&b->decaying_flows);
2752 		b->sparse_flow_count = 0;
2753 		b->bulk_flow_count = 0;
2754 		b->decaying_flow_count = 0;
2755 
2756 		for (j = 0; j < CAKE_QUEUES; j++) {
2757 			struct cake_flow *flow = b->flows + j;
2758 			u32 k = j * CAKE_MAX_TINS + i;
2759 
2760 			INIT_LIST_HEAD(&flow->flowchain);
2761 			cobalt_vars_init(&flow->cvars);
2762 
2763 			q->overflow_heap[k].t = i;
2764 			q->overflow_heap[k].b = j;
2765 			b->overflow_idx[j] = k;
2766 		}
2767 	}
2768 
2769 	cake_reconfigure(sch);
2770 	q->avg_peak_bandwidth = q->rate_bps;
2771 	q->min_netlen = ~0;
2772 	q->min_adjlen = ~0;
2773 	return 0;
2774 }
2775 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2776 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2777 {
2778 	struct cake_sched_data *q = qdisc_priv(sch);
2779 	struct nlattr *opts;
2780 
2781 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2782 	if (!opts)
2783 		goto nla_put_failure;
2784 
2785 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2786 			      TCA_CAKE_PAD))
2787 		goto nla_put_failure;
2788 
2789 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2790 			q->flow_mode & CAKE_FLOW_MASK))
2791 		goto nla_put_failure;
2792 
2793 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2794 		goto nla_put_failure;
2795 
2796 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2797 		goto nla_put_failure;
2798 
2799 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2800 		goto nla_put_failure;
2801 
2802 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2803 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2804 		goto nla_put_failure;
2805 
2806 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2807 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2808 		goto nla_put_failure;
2809 
2810 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2811 		goto nla_put_failure;
2812 
2813 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2814 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2815 		goto nla_put_failure;
2816 
2817 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2818 		goto nla_put_failure;
2819 
2820 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2821 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2822 		goto nla_put_failure;
2823 
2824 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2825 		goto nla_put_failure;
2826 
2827 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2828 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2829 			goto nla_put_failure;
2830 
2831 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2832 		goto nla_put_failure;
2833 
2834 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2835 		goto nla_put_failure;
2836 
2837 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2838 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2839 		goto nla_put_failure;
2840 
2841 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2842 		goto nla_put_failure;
2843 
2844 	return nla_nest_end(skb, opts);
2845 
2846 nla_put_failure:
2847 	return -1;
2848 }
2849 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2850 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2851 {
2852 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2853 	struct cake_sched_data *q = qdisc_priv(sch);
2854 	struct nlattr *tstats, *ts;
2855 	int i;
2856 
2857 	if (!stats)
2858 		return -1;
2859 
2860 #define PUT_STAT_U32(attr, data) do {				       \
2861 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2862 			goto nla_put_failure;			       \
2863 	} while (0)
2864 #define PUT_STAT_U64(attr, data) do {				       \
2865 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2866 					data, TCA_CAKE_STATS_PAD)) \
2867 			goto nla_put_failure;			       \
2868 	} while (0)
2869 
2870 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2871 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2872 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2873 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2874 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2875 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2876 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2877 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2878 
2879 #undef PUT_STAT_U32
2880 #undef PUT_STAT_U64
2881 
2882 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2883 	if (!tstats)
2884 		goto nla_put_failure;
2885 
2886 #define PUT_TSTAT_U32(attr, data) do {					\
2887 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2888 			goto nla_put_failure;				\
2889 	} while (0)
2890 #define PUT_TSTAT_U64(attr, data) do {					\
2891 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2892 					data, TCA_CAKE_TIN_STATS_PAD))	\
2893 			goto nla_put_failure;				\
2894 	} while (0)
2895 
2896 	for (i = 0; i < q->tin_cnt; i++) {
2897 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2898 
2899 		ts = nla_nest_start_noflag(d->skb, i + 1);
2900 		if (!ts)
2901 			goto nla_put_failure;
2902 
2903 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2904 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2905 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2906 
2907 		PUT_TSTAT_U32(TARGET_US,
2908 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2909 		PUT_TSTAT_U32(INTERVAL_US,
2910 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2911 
2912 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2913 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2914 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2915 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2916 
2917 		PUT_TSTAT_U32(PEAK_DELAY_US,
2918 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2919 		PUT_TSTAT_U32(AVG_DELAY_US,
2920 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2921 		PUT_TSTAT_U32(BASE_DELAY_US,
2922 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2923 
2924 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2925 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2926 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2927 
2928 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2929 					    b->decaying_flow_count);
2930 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2931 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2932 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2933 
2934 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2935 		nla_nest_end(d->skb, ts);
2936 	}
2937 
2938 #undef PUT_TSTAT_U32
2939 #undef PUT_TSTAT_U64
2940 
2941 	nla_nest_end(d->skb, tstats);
2942 	return nla_nest_end(d->skb, stats);
2943 
2944 nla_put_failure:
2945 	nla_nest_cancel(d->skb, stats);
2946 	return -1;
2947 }
2948 
cake_leaf(struct Qdisc * sch,unsigned long arg)2949 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2950 {
2951 	return NULL;
2952 }
2953 
cake_find(struct Qdisc * sch,u32 classid)2954 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2955 {
2956 	return 0;
2957 }
2958 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2959 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2960 			       u32 classid)
2961 {
2962 	return 0;
2963 }
2964 
cake_unbind(struct Qdisc * q,unsigned long cl)2965 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2966 {
2967 }
2968 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2969 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2970 					struct netlink_ext_ack *extack)
2971 {
2972 	struct cake_sched_data *q = qdisc_priv(sch);
2973 
2974 	if (cl)
2975 		return NULL;
2976 	return q->block;
2977 }
2978 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)2979 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2980 			   struct sk_buff *skb, struct tcmsg *tcm)
2981 {
2982 	tcm->tcm_handle |= TC_H_MIN(cl);
2983 	return 0;
2984 }
2985 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)2986 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2987 				 struct gnet_dump *d)
2988 {
2989 	struct cake_sched_data *q = qdisc_priv(sch);
2990 	const struct cake_flow *flow = NULL;
2991 	struct gnet_stats_queue qs = { 0 };
2992 	struct nlattr *stats;
2993 	u32 idx = cl - 1;
2994 
2995 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2996 		const struct cake_tin_data *b = \
2997 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2998 		const struct sk_buff *skb;
2999 
3000 		flow = &b->flows[idx % CAKE_QUEUES];
3001 
3002 		if (flow->head) {
3003 			sch_tree_lock(sch);
3004 			skb = flow->head;
3005 			while (skb) {
3006 				qs.qlen++;
3007 				skb = skb->next;
3008 			}
3009 			sch_tree_unlock(sch);
3010 		}
3011 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3012 		qs.drops = flow->dropped;
3013 	}
3014 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3015 		return -1;
3016 	if (flow) {
3017 		ktime_t now = ktime_get();
3018 
3019 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3020 		if (!stats)
3021 			return -1;
3022 
3023 #define PUT_STAT_U32(attr, data) do {				       \
3024 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3025 			goto nla_put_failure;			       \
3026 	} while (0)
3027 #define PUT_STAT_S32(attr, data) do {				       \
3028 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3029 			goto nla_put_failure;			       \
3030 	} while (0)
3031 
3032 		PUT_STAT_S32(DEFICIT, flow->deficit);
3033 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3034 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3035 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3036 		if (flow->cvars.p_drop) {
3037 			PUT_STAT_S32(BLUE_TIMER_US,
3038 				     ktime_to_us(
3039 					     ktime_sub(now,
3040 						       flow->cvars.blue_timer)));
3041 		}
3042 		if (flow->cvars.dropping) {
3043 			PUT_STAT_S32(DROP_NEXT_US,
3044 				     ktime_to_us(
3045 					     ktime_sub(now,
3046 						       flow->cvars.drop_next)));
3047 		}
3048 
3049 		if (nla_nest_end(d->skb, stats) < 0)
3050 			return -1;
3051 	}
3052 
3053 	return 0;
3054 
3055 nla_put_failure:
3056 	nla_nest_cancel(d->skb, stats);
3057 	return -1;
3058 }
3059 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3060 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3061 {
3062 	struct cake_sched_data *q = qdisc_priv(sch);
3063 	unsigned int i, j;
3064 
3065 	if (arg->stop)
3066 		return;
3067 
3068 	for (i = 0; i < q->tin_cnt; i++) {
3069 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3070 
3071 		for (j = 0; j < CAKE_QUEUES; j++) {
3072 			if (list_empty(&b->flows[j].flowchain)) {
3073 				arg->count++;
3074 				continue;
3075 			}
3076 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3077 						 arg))
3078 				break;
3079 		}
3080 	}
3081 }
3082 
3083 static const struct Qdisc_class_ops cake_class_ops = {
3084 	.leaf		=	cake_leaf,
3085 	.find		=	cake_find,
3086 	.tcf_block	=	cake_tcf_block,
3087 	.bind_tcf	=	cake_bind,
3088 	.unbind_tcf	=	cake_unbind,
3089 	.dump		=	cake_dump_class,
3090 	.dump_stats	=	cake_dump_class_stats,
3091 	.walk		=	cake_walk,
3092 };
3093 
3094 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3095 	.cl_ops		=	&cake_class_ops,
3096 	.id		=	"cake",
3097 	.priv_size	=	sizeof(struct cake_sched_data),
3098 	.enqueue	=	cake_enqueue,
3099 	.dequeue	=	cake_dequeue,
3100 	.peek		=	qdisc_peek_dequeued,
3101 	.init		=	cake_init,
3102 	.reset		=	cake_reset,
3103 	.destroy	=	cake_destroy,
3104 	.change		=	cake_change,
3105 	.dump		=	cake_dump,
3106 	.dump_stats	=	cake_dump_stats,
3107 	.owner		=	THIS_MODULE,
3108 };
3109 
cake_module_init(void)3110 static int __init cake_module_init(void)
3111 {
3112 	return register_qdisc(&cake_qdisc_ops);
3113 }
3114 
cake_module_exit(void)3115 static void __exit cake_module_exit(void)
3116 {
3117 	unregister_qdisc(&cake_qdisc_ops);
3118 }
3119 
3120 module_init(cake_module_init)
3121 module_exit(cake_module_exit)
3122 MODULE_AUTHOR("Jonathan Morton");
3123 MODULE_LICENSE("Dual BSD/GPL");
3124 MODULE_DESCRIPTION("The CAKE shaper.");
3125