xref: /openbmc/linux/net/sched/sch_cake.c (revision d457a0e3)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
us_to_ns(u64 us)278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
get_cobalt_cb(const struct sk_buff * skb)283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
cobalt_get_enqueue_time(const struct sk_buff * skb)289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363 
364 #define REC_INV_SQRT_CACHE (16)
365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366 
367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369  *
370  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371  */
372 
cobalt_newton_step(struct cobalt_vars * vars)373 static void cobalt_newton_step(struct cobalt_vars *vars)
374 {
375 	u32 invsqrt, invsqrt2;
376 	u64 val;
377 
378 	invsqrt = vars->rec_inv_sqrt;
379 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381 
382 	val >>= 2; /* avoid overflow in following multiply */
383 	val = (val * invsqrt) >> (32 - 2 + 1);
384 
385 	vars->rec_inv_sqrt = val;
386 }
387 
cobalt_invsqrt(struct cobalt_vars * vars)388 static void cobalt_invsqrt(struct cobalt_vars *vars)
389 {
390 	if (vars->count < REC_INV_SQRT_CACHE)
391 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392 	else
393 		cobalt_newton_step(vars);
394 }
395 
396 /* There is a big difference in timing between the accurate values placed in
397  * the cache and the approximations given by a single Newton step for small
398  * count values, particularly when stepping from count 1 to 2 or vice versa.
399  * Above 16, a single Newton step gives sufficient accuracy in either
400  * direction, given the precision stored.
401  *
402  * The magnitude of the error when stepping up to count 2 is such as to give
403  * the value that *should* have been produced at count 4.
404  */
405 
cobalt_cache_init(void)406 static void cobalt_cache_init(void)
407 {
408 	struct cobalt_vars v;
409 
410 	memset(&v, 0, sizeof(v));
411 	v.rec_inv_sqrt = ~0U;
412 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413 
414 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 		cobalt_newton_step(&v);
419 
420 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421 	}
422 }
423 
cobalt_vars_init(struct cobalt_vars * vars)424 static void cobalt_vars_init(struct cobalt_vars *vars)
425 {
426 	memset(vars, 0, sizeof(*vars));
427 
428 	if (!cobalt_rec_inv_sqrt_cache[0]) {
429 		cobalt_cache_init();
430 		cobalt_rec_inv_sqrt_cache[0] = ~0;
431 	}
432 }
433 
434 /* CoDel control_law is t + interval/sqrt(count)
435  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436  * both sqrt() and divide operation.
437  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)438 static ktime_t cobalt_control(ktime_t t,
439 			      u64 interval,
440 			      u32 rec_inv_sqrt)
441 {
442 	return ktime_add_ns(t, reciprocal_scale(interval,
443 						rec_inv_sqrt));
444 }
445 
446 /* Call this when a packet had to be dropped due to queue overflow.  Returns
447  * true if the BLUE state was quiescent before but active after this call.
448  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)449 static bool cobalt_queue_full(struct cobalt_vars *vars,
450 			      struct cobalt_params *p,
451 			      ktime_t now)
452 {
453 	bool up = false;
454 
455 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456 		up = !vars->p_drop;
457 		vars->p_drop += p->p_inc;
458 		if (vars->p_drop < p->p_inc)
459 			vars->p_drop = ~0;
460 		vars->blue_timer = now;
461 	}
462 	vars->dropping = true;
463 	vars->drop_next = now;
464 	if (!vars->count)
465 		vars->count = 1;
466 
467 	return up;
468 }
469 
470 /* Call this when the queue was serviced but turned out to be empty.  Returns
471  * true if the BLUE state was active before but quiescent after this call.
472  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)473 static bool cobalt_queue_empty(struct cobalt_vars *vars,
474 			       struct cobalt_params *p,
475 			       ktime_t now)
476 {
477 	bool down = false;
478 
479 	if (vars->p_drop &&
480 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481 		if (vars->p_drop < p->p_dec)
482 			vars->p_drop = 0;
483 		else
484 			vars->p_drop -= p->p_dec;
485 		vars->blue_timer = now;
486 		down = !vars->p_drop;
487 	}
488 	vars->dropping = false;
489 
490 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491 		vars->count--;
492 		cobalt_invsqrt(vars);
493 		vars->drop_next = cobalt_control(vars->drop_next,
494 						 p->interval,
495 						 vars->rec_inv_sqrt);
496 	}
497 
498 	return down;
499 }
500 
501 /* Call this with a freshly dequeued packet for possible congestion marking.
502  * Returns true as an instruction to drop the packet, false for delivery.
503  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)504 static bool cobalt_should_drop(struct cobalt_vars *vars,
505 			       struct cobalt_params *p,
506 			       ktime_t now,
507 			       struct sk_buff *skb,
508 			       u32 bulk_flows)
509 {
510 	bool next_due, over_target, drop = false;
511 	ktime_t schedule;
512 	u64 sojourn;
513 
514 /* The 'schedule' variable records, in its sign, whether 'now' is before or
515  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516  * scheduling decision is actually branched, without destroying that
517  * information.  Similarly, the first 'schedule' value calculated is preserved
518  * in the boolean 'next_due'.
519  *
520  * As for 'drop_next', we take advantage of the fact that 'interval' is both
521  * the delay between first exceeding 'target' and the first signalling event,
522  * *and* the scaling factor for the signalling frequency.  It's therefore very
523  * natural to use a single mechanism for both purposes, and eliminates a
524  * significant amount of reference Codel's spaghetti code.  To help with this,
525  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526  * as possible to 1.0 in fixed-point.
527  */
528 
529 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530 	schedule = ktime_sub(now, vars->drop_next);
531 	over_target = sojourn > p->target &&
532 		      sojourn > p->mtu_time * bulk_flows * 2 &&
533 		      sojourn > p->mtu_time * 4;
534 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535 
536 	vars->ecn_marked = false;
537 
538 	if (over_target) {
539 		if (!vars->dropping) {
540 			vars->dropping = true;
541 			vars->drop_next = cobalt_control(now,
542 							 p->interval,
543 							 vars->rec_inv_sqrt);
544 		}
545 		if (!vars->count)
546 			vars->count = 1;
547 	} else if (vars->dropping) {
548 		vars->dropping = false;
549 	}
550 
551 	if (next_due && vars->dropping) {
552 		/* Use ECN mark if possible, otherwise drop */
553 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554 
555 		vars->count++;
556 		if (!vars->count)
557 			vars->count--;
558 		cobalt_invsqrt(vars);
559 		vars->drop_next = cobalt_control(vars->drop_next,
560 						 p->interval,
561 						 vars->rec_inv_sqrt);
562 		schedule = ktime_sub(now, vars->drop_next);
563 	} else {
564 		while (next_due) {
565 			vars->count--;
566 			cobalt_invsqrt(vars);
567 			vars->drop_next = cobalt_control(vars->drop_next,
568 							 p->interval,
569 							 vars->rec_inv_sqrt);
570 			schedule = ktime_sub(now, vars->drop_next);
571 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 		}
573 	}
574 
575 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
576 	if (vars->p_drop)
577 		drop |= (get_random_u32() < vars->p_drop);
578 
579 	/* Overload the drop_next field as an activity timeout */
580 	if (!vars->count)
581 		vars->drop_next = ktime_add_ns(now, p->interval);
582 	else if (ktime_to_ns(schedule) > 0 && !drop)
583 		vars->drop_next = now;
584 
585 	return drop;
586 }
587 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)588 static bool cake_update_flowkeys(struct flow_keys *keys,
589 				 const struct sk_buff *skb)
590 {
591 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592 	struct nf_conntrack_tuple tuple = {};
593 	bool rev = !skb->_nfct, upd = false;
594 	__be32 ip;
595 
596 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
597 		return false;
598 
599 	if (!nf_ct_get_tuple_skb(&tuple, skb))
600 		return false;
601 
602 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603 	if (ip != keys->addrs.v4addrs.src) {
604 		keys->addrs.v4addrs.src = ip;
605 		upd = true;
606 	}
607 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608 	if (ip != keys->addrs.v4addrs.dst) {
609 		keys->addrs.v4addrs.dst = ip;
610 		upd = true;
611 	}
612 
613 	if (keys->ports.ports) {
614 		__be16 port;
615 
616 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
617 		if (port != keys->ports.src) {
618 			keys->ports.src = port;
619 			upd = true;
620 		}
621 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
622 		if (port != keys->ports.dst) {
623 			port = keys->ports.dst;
624 			upd = true;
625 		}
626 	}
627 	return upd;
628 #else
629 	return false;
630 #endif
631 }
632 
633 /* Cake has several subtle multiple bit settings. In these cases you
634  *  would be matching triple isolate mode as well.
635  */
636 
cake_dsrc(int flow_mode)637 static bool cake_dsrc(int flow_mode)
638 {
639 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640 }
641 
cake_ddst(int flow_mode)642 static bool cake_ddst(int flow_mode)
643 {
644 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645 }
646 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)647 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
648 		     int flow_mode, u16 flow_override, u16 host_override)
649 {
650 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
651 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
652 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
653 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
654 	u16 reduced_hash, srchost_idx, dsthost_idx;
655 	struct flow_keys keys, host_keys;
656 	bool use_skbhash = skb->l4_hash;
657 
658 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
659 		return 0;
660 
661 	/* If both overrides are set, or we can use the SKB hash and nat mode is
662 	 * disabled, we can skip packet dissection entirely. If nat mode is
663 	 * enabled there's another check below after doing the conntrack lookup.
664 	 */
665 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
666 		goto skip_hash;
667 
668 	skb_flow_dissect_flow_keys(skb, &keys,
669 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
670 
671 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
672 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
673 		use_skbhash = false;
674 
675 	/* If we can still use the SKB hash and don't need the host hash, we can
676 	 * skip the rest of the hashing procedure
677 	 */
678 	if (use_skbhash && !hash_hosts)
679 		goto skip_hash;
680 
681 	/* flow_hash_from_keys() sorts the addresses by value, so we have
682 	 * to preserve their order in a separate data structure to treat
683 	 * src and dst host addresses as independently selectable.
684 	 */
685 	host_keys = keys;
686 	host_keys.ports.ports     = 0;
687 	host_keys.basic.ip_proto  = 0;
688 	host_keys.keyid.keyid     = 0;
689 	host_keys.tags.flow_label = 0;
690 
691 	switch (host_keys.control.addr_type) {
692 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
693 		host_keys.addrs.v4addrs.src = 0;
694 		dsthost_hash = flow_hash_from_keys(&host_keys);
695 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
696 		host_keys.addrs.v4addrs.dst = 0;
697 		srchost_hash = flow_hash_from_keys(&host_keys);
698 		break;
699 
700 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
701 		memset(&host_keys.addrs.v6addrs.src, 0,
702 		       sizeof(host_keys.addrs.v6addrs.src));
703 		dsthost_hash = flow_hash_from_keys(&host_keys);
704 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
705 		memset(&host_keys.addrs.v6addrs.dst, 0,
706 		       sizeof(host_keys.addrs.v6addrs.dst));
707 		srchost_hash = flow_hash_from_keys(&host_keys);
708 		break;
709 
710 	default:
711 		dsthost_hash = 0;
712 		srchost_hash = 0;
713 	}
714 
715 	/* This *must* be after the above switch, since as a
716 	 * side-effect it sorts the src and dst addresses.
717 	 */
718 	if (hash_flows && !use_skbhash)
719 		flow_hash = flow_hash_from_keys(&keys);
720 
721 skip_hash:
722 	if (flow_override)
723 		flow_hash = flow_override - 1;
724 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
725 		flow_hash = skb->hash;
726 	if (host_override) {
727 		dsthost_hash = host_override - 1;
728 		srchost_hash = host_override - 1;
729 	}
730 
731 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
732 		if (flow_mode & CAKE_FLOW_SRC_IP)
733 			flow_hash ^= srchost_hash;
734 
735 		if (flow_mode & CAKE_FLOW_DST_IP)
736 			flow_hash ^= dsthost_hash;
737 	}
738 
739 	reduced_hash = flow_hash % CAKE_QUEUES;
740 
741 	/* set-associative hashing */
742 	/* fast path if no hash collision (direct lookup succeeds) */
743 	if (likely(q->tags[reduced_hash] == flow_hash &&
744 		   q->flows[reduced_hash].set)) {
745 		q->way_directs++;
746 	} else {
747 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
748 		u32 outer_hash = reduced_hash - inner_hash;
749 		bool allocate_src = false;
750 		bool allocate_dst = false;
751 		u32 i, k;
752 
753 		/* check if any active queue in the set is reserved for
754 		 * this flow.
755 		 */
756 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
757 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
758 			if (q->tags[outer_hash + k] == flow_hash) {
759 				if (i)
760 					q->way_hits++;
761 
762 				if (!q->flows[outer_hash + k].set) {
763 					/* need to increment host refcnts */
764 					allocate_src = cake_dsrc(flow_mode);
765 					allocate_dst = cake_ddst(flow_mode);
766 				}
767 
768 				goto found;
769 			}
770 		}
771 
772 		/* no queue is reserved for this flow, look for an
773 		 * empty one.
774 		 */
775 		for (i = 0; i < CAKE_SET_WAYS;
776 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
777 			if (!q->flows[outer_hash + k].set) {
778 				q->way_misses++;
779 				allocate_src = cake_dsrc(flow_mode);
780 				allocate_dst = cake_ddst(flow_mode);
781 				goto found;
782 			}
783 		}
784 
785 		/* With no empty queues, default to the original
786 		 * queue, accept the collision, update the host tags.
787 		 */
788 		q->way_collisions++;
789 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
790 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
791 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
792 		}
793 		allocate_src = cake_dsrc(flow_mode);
794 		allocate_dst = cake_ddst(flow_mode);
795 found:
796 		/* reserve queue for future packets in same flow */
797 		reduced_hash = outer_hash + k;
798 		q->tags[reduced_hash] = flow_hash;
799 
800 		if (allocate_src) {
801 			srchost_idx = srchost_hash % CAKE_QUEUES;
802 			inner_hash = srchost_idx % CAKE_SET_WAYS;
803 			outer_hash = srchost_idx - inner_hash;
804 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
805 				i++, k = (k + 1) % CAKE_SET_WAYS) {
806 				if (q->hosts[outer_hash + k].srchost_tag ==
807 				    srchost_hash)
808 					goto found_src;
809 			}
810 			for (i = 0; i < CAKE_SET_WAYS;
811 				i++, k = (k + 1) % CAKE_SET_WAYS) {
812 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
813 					break;
814 			}
815 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
816 found_src:
817 			srchost_idx = outer_hash + k;
818 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
819 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
820 			q->flows[reduced_hash].srchost = srchost_idx;
821 		}
822 
823 		if (allocate_dst) {
824 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
825 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
826 			outer_hash = dsthost_idx - inner_hash;
827 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
828 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
829 				if (q->hosts[outer_hash + k].dsthost_tag ==
830 				    dsthost_hash)
831 					goto found_dst;
832 			}
833 			for (i = 0; i < CAKE_SET_WAYS;
834 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
835 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
836 					break;
837 			}
838 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
839 found_dst:
840 			dsthost_idx = outer_hash + k;
841 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
842 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
843 			q->flows[reduced_hash].dsthost = dsthost_idx;
844 		}
845 	}
846 
847 	return reduced_hash;
848 }
849 
850 /* helper functions : might be changed when/if skb use a standard list_head */
851 /* remove one skb from head of slot queue */
852 
dequeue_head(struct cake_flow * flow)853 static struct sk_buff *dequeue_head(struct cake_flow *flow)
854 {
855 	struct sk_buff *skb = flow->head;
856 
857 	if (skb) {
858 		flow->head = skb->next;
859 		skb_mark_not_on_list(skb);
860 	}
861 
862 	return skb;
863 }
864 
865 /* add skb to flow queue (tail add) */
866 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)867 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
868 {
869 	if (!flow->head)
870 		flow->head = skb;
871 	else
872 		flow->tail->next = skb;
873 	flow->tail = skb;
874 	skb->next = NULL;
875 }
876 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)877 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
878 				    struct ipv6hdr *buf)
879 {
880 	unsigned int offset = skb_network_offset(skb);
881 	struct iphdr *iph;
882 
883 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
884 
885 	if (!iph)
886 		return NULL;
887 
888 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
889 		return skb_header_pointer(skb, offset + iph->ihl * 4,
890 					  sizeof(struct ipv6hdr), buf);
891 
892 	else if (iph->version == 4)
893 		return iph;
894 
895 	else if (iph->version == 6)
896 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
897 					  buf);
898 
899 	return NULL;
900 }
901 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)902 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
903 				      void *buf, unsigned int bufsize)
904 {
905 	unsigned int offset = skb_network_offset(skb);
906 	const struct ipv6hdr *ipv6h;
907 	const struct tcphdr *tcph;
908 	const struct iphdr *iph;
909 	struct ipv6hdr _ipv6h;
910 	struct tcphdr _tcph;
911 
912 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
913 
914 	if (!ipv6h)
915 		return NULL;
916 
917 	if (ipv6h->version == 4) {
918 		iph = (struct iphdr *)ipv6h;
919 		offset += iph->ihl * 4;
920 
921 		/* special-case 6in4 tunnelling, as that is a common way to get
922 		 * v6 connectivity in the home
923 		 */
924 		if (iph->protocol == IPPROTO_IPV6) {
925 			ipv6h = skb_header_pointer(skb, offset,
926 						   sizeof(_ipv6h), &_ipv6h);
927 
928 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
929 				return NULL;
930 
931 			offset += sizeof(struct ipv6hdr);
932 
933 		} else if (iph->protocol != IPPROTO_TCP) {
934 			return NULL;
935 		}
936 
937 	} else if (ipv6h->version == 6) {
938 		if (ipv6h->nexthdr != IPPROTO_TCP)
939 			return NULL;
940 
941 		offset += sizeof(struct ipv6hdr);
942 	} else {
943 		return NULL;
944 	}
945 
946 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
947 	if (!tcph || tcph->doff < 5)
948 		return NULL;
949 
950 	return skb_header_pointer(skb, offset,
951 				  min(__tcp_hdrlen(tcph), bufsize), buf);
952 }
953 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)954 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
955 				   int code, int *oplen)
956 {
957 	/* inspired by tcp_parse_options in tcp_input.c */
958 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
959 	const u8 *ptr = (const u8 *)(tcph + 1);
960 
961 	while (length > 0) {
962 		int opcode = *ptr++;
963 		int opsize;
964 
965 		if (opcode == TCPOPT_EOL)
966 			break;
967 		if (opcode == TCPOPT_NOP) {
968 			length--;
969 			continue;
970 		}
971 		if (length < 2)
972 			break;
973 		opsize = *ptr++;
974 		if (opsize < 2 || opsize > length)
975 			break;
976 
977 		if (opcode == code) {
978 			*oplen = opsize;
979 			return ptr;
980 		}
981 
982 		ptr += opsize - 2;
983 		length -= opsize;
984 	}
985 
986 	return NULL;
987 }
988 
989 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
990  * bytes than the other. In the case where both sequences ACKs bytes that the
991  * other doesn't, A is considered greater. DSACKs in A also makes A be
992  * considered greater.
993  *
994  * @return -1, 0 or 1 as normal compare functions
995  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)996 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
997 				  const struct tcphdr *tcph_b)
998 {
999 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1000 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1001 	u32 bytes_a = 0, bytes_b = 0;
1002 	int oplen_a, oplen_b;
1003 	bool first = true;
1004 
1005 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1006 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1007 
1008 	/* pointers point to option contents */
1009 	oplen_a -= TCPOLEN_SACK_BASE;
1010 	oplen_b -= TCPOLEN_SACK_BASE;
1011 
1012 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1013 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1014 		return -1;
1015 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1016 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1017 		return 1;
1018 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1019 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1020 		return 0;
1021 
1022 	while (oplen_a >= sizeof(*sack_a)) {
1023 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1024 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1025 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1026 		int oplen_tmp = oplen_b;
1027 		bool found = false;
1028 
1029 		/* DSACK; always considered greater to prevent dropping */
1030 		if (before(start_a, ack_seq_a))
1031 			return -1;
1032 
1033 		bytes_a += end_a - start_a;
1034 
1035 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1036 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1037 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1038 
1039 			/* first time through we count the total size */
1040 			if (first)
1041 				bytes_b += end_b - start_b;
1042 
1043 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1044 				found = true;
1045 				if (!first)
1046 					break;
1047 			}
1048 			oplen_tmp -= sizeof(*sack_tmp);
1049 			sack_tmp++;
1050 		}
1051 
1052 		if (!found)
1053 			return -1;
1054 
1055 		oplen_a -= sizeof(*sack_a);
1056 		sack_a++;
1057 		first = false;
1058 	}
1059 
1060 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1061 	 * either the SACKs are equal, or B SACKs more bytes.
1062 	 */
1063 	return bytes_b > bytes_a ? 1 : 0;
1064 }
1065 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1066 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1067 				 u32 *tsval, u32 *tsecr)
1068 {
1069 	const u8 *ptr;
1070 	int opsize;
1071 
1072 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1073 
1074 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1075 		*tsval = get_unaligned_be32(ptr);
1076 		*tsecr = get_unaligned_be32(ptr + 4);
1077 	}
1078 }
1079 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1080 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1081 			       u32 tstamp_new, u32 tsecr_new)
1082 {
1083 	/* inspired by tcp_parse_options in tcp_input.c */
1084 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1085 	const u8 *ptr = (const u8 *)(tcph + 1);
1086 	u32 tstamp, tsecr;
1087 
1088 	/* 3 reserved flags must be unset to avoid future breakage
1089 	 * ACK must be set
1090 	 * ECE/CWR are handled separately
1091 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1092 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1093 	 * 0x00C00000 = CWR/ECE (handled separately)
1094 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1095 	 */
1096 	if (((tcp_flag_word(tcph) &
1097 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1098 		return false;
1099 
1100 	while (length > 0) {
1101 		int opcode = *ptr++;
1102 		int opsize;
1103 
1104 		if (opcode == TCPOPT_EOL)
1105 			break;
1106 		if (opcode == TCPOPT_NOP) {
1107 			length--;
1108 			continue;
1109 		}
1110 		if (length < 2)
1111 			break;
1112 		opsize = *ptr++;
1113 		if (opsize < 2 || opsize > length)
1114 			break;
1115 
1116 		switch (opcode) {
1117 		case TCPOPT_MD5SIG: /* doesn't influence state */
1118 			break;
1119 
1120 		case TCPOPT_SACK: /* stricter checking performed later */
1121 			if (opsize % 8 != 2)
1122 				return false;
1123 			break;
1124 
1125 		case TCPOPT_TIMESTAMP:
1126 			/* only drop timestamps lower than new */
1127 			if (opsize != TCPOLEN_TIMESTAMP)
1128 				return false;
1129 			tstamp = get_unaligned_be32(ptr);
1130 			tsecr = get_unaligned_be32(ptr + 4);
1131 			if (after(tstamp, tstamp_new) ||
1132 			    after(tsecr, tsecr_new))
1133 				return false;
1134 			break;
1135 
1136 		case TCPOPT_MSS:  /* these should only be set on SYN */
1137 		case TCPOPT_WINDOW:
1138 		case TCPOPT_SACK_PERM:
1139 		case TCPOPT_FASTOPEN:
1140 		case TCPOPT_EXP:
1141 		default: /* don't drop if any unknown options are present */
1142 			return false;
1143 		}
1144 
1145 		ptr += opsize - 2;
1146 		length -= opsize;
1147 	}
1148 
1149 	return true;
1150 }
1151 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1152 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1153 				       struct cake_flow *flow)
1154 {
1155 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1156 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1157 	struct sk_buff *skb_check, *skb_prev = NULL;
1158 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1159 	unsigned char _tcph[64], _tcph_check[64];
1160 	const struct tcphdr *tcph, *tcph_check;
1161 	const struct iphdr *iph, *iph_check;
1162 	struct ipv6hdr _iph, _iph_check;
1163 	const struct sk_buff *skb;
1164 	int seglen, num_found = 0;
1165 	u32 tstamp = 0, tsecr = 0;
1166 	__be32 elig_flags = 0;
1167 	int sack_comp;
1168 
1169 	/* no other possible ACKs to filter */
1170 	if (flow->head == flow->tail)
1171 		return NULL;
1172 
1173 	skb = flow->tail;
1174 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1175 	iph = cake_get_iphdr(skb, &_iph);
1176 	if (!tcph)
1177 		return NULL;
1178 
1179 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1180 
1181 	/* the 'triggering' packet need only have the ACK flag set.
1182 	 * also check that SYN is not set, as there won't be any previous ACKs.
1183 	 */
1184 	if ((tcp_flag_word(tcph) &
1185 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1186 		return NULL;
1187 
1188 	/* the 'triggering' ACK is at the tail of the queue, we have already
1189 	 * returned if it is the only packet in the flow. loop through the rest
1190 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1191 	 * triggering one.
1192 	 */
1193 	for (skb_check = flow->head;
1194 	     skb_check && skb_check != skb;
1195 	     skb_prev = skb_check, skb_check = skb_check->next) {
1196 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1197 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1198 					     sizeof(_tcph_check));
1199 
1200 		/* only TCP packets with matching 5-tuple are eligible, and only
1201 		 * drop safe headers
1202 		 */
1203 		if (!tcph_check || iph->version != iph_check->version ||
1204 		    tcph_check->source != tcph->source ||
1205 		    tcph_check->dest != tcph->dest)
1206 			continue;
1207 
1208 		if (iph_check->version == 4) {
1209 			if (iph_check->saddr != iph->saddr ||
1210 			    iph_check->daddr != iph->daddr)
1211 				continue;
1212 
1213 			seglen = iph_totlen(skb, iph_check) -
1214 				       (4 * iph_check->ihl);
1215 		} else if (iph_check->version == 6) {
1216 			ipv6h = (struct ipv6hdr *)iph;
1217 			ipv6h_check = (struct ipv6hdr *)iph_check;
1218 
1219 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1220 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1221 				continue;
1222 
1223 			seglen = ntohs(ipv6h_check->payload_len);
1224 		} else {
1225 			WARN_ON(1);  /* shouldn't happen */
1226 			continue;
1227 		}
1228 
1229 		/* If the ECE/CWR flags changed from the previous eligible
1230 		 * packet in the same flow, we should no longer be dropping that
1231 		 * previous packet as this would lose information.
1232 		 */
1233 		if (elig_ack && (tcp_flag_word(tcph_check) &
1234 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1235 			elig_ack = NULL;
1236 			elig_ack_prev = NULL;
1237 			num_found--;
1238 		}
1239 
1240 		/* Check TCP options and flags, don't drop ACKs with segment
1241 		 * data, and don't drop ACKs with a higher cumulative ACK
1242 		 * counter than the triggering packet. Check ACK seqno here to
1243 		 * avoid parsing SACK options of packets we are going to exclude
1244 		 * anyway.
1245 		 */
1246 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1247 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1248 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1249 			continue;
1250 
1251 		/* Check SACK options. The triggering packet must SACK more data
1252 		 * than the ACK under consideration, or SACK the same range but
1253 		 * have a larger cumulative ACK counter. The latter is a
1254 		 * pathological case, but is contained in the following check
1255 		 * anyway, just to be safe.
1256 		 */
1257 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1258 
1259 		if (sack_comp < 0 ||
1260 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1261 		     sack_comp == 0))
1262 			continue;
1263 
1264 		/* At this point we have found an eligible pure ACK to drop; if
1265 		 * we are in aggressive mode, we are done. Otherwise, keep
1266 		 * searching unless this is the second eligible ACK we
1267 		 * found.
1268 		 *
1269 		 * Since we want to drop ACK closest to the head of the queue,
1270 		 * save the first eligible ACK we find, even if we need to loop
1271 		 * again.
1272 		 */
1273 		if (!elig_ack) {
1274 			elig_ack = skb_check;
1275 			elig_ack_prev = skb_prev;
1276 			elig_flags = (tcp_flag_word(tcph_check)
1277 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1278 		}
1279 
1280 		if (num_found++ > 0)
1281 			goto found;
1282 	}
1283 
1284 	/* We made it through the queue without finding two eligible ACKs . If
1285 	 * we found a single eligible ACK we can drop it in aggressive mode if
1286 	 * we can guarantee that this does not interfere with ECN flag
1287 	 * information. We ensure this by dropping it only if the enqueued
1288 	 * packet is consecutive with the eligible ACK, and their flags match.
1289 	 */
1290 	if (elig_ack && aggressive && elig_ack->next == skb &&
1291 	    (elig_flags == (tcp_flag_word(tcph) &
1292 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1293 		goto found;
1294 
1295 	return NULL;
1296 
1297 found:
1298 	if (elig_ack_prev)
1299 		elig_ack_prev->next = elig_ack->next;
1300 	else
1301 		flow->head = elig_ack->next;
1302 
1303 	skb_mark_not_on_list(elig_ack);
1304 
1305 	return elig_ack;
1306 }
1307 
cake_ewma(u64 avg,u64 sample,u32 shift)1308 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1309 {
1310 	avg -= avg >> shift;
1311 	avg += sample >> shift;
1312 	return avg;
1313 }
1314 
cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1315 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1316 {
1317 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1318 		len -= off;
1319 
1320 	if (q->max_netlen < len)
1321 		q->max_netlen = len;
1322 	if (q->min_netlen > len)
1323 		q->min_netlen = len;
1324 
1325 	len += q->rate_overhead;
1326 
1327 	if (len < q->rate_mpu)
1328 		len = q->rate_mpu;
1329 
1330 	if (q->atm_mode == CAKE_ATM_ATM) {
1331 		len += 47;
1332 		len /= 48;
1333 		len *= 53;
1334 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1335 		/* Add one byte per 64 bytes or part thereof.
1336 		 * This is conservative and easier to calculate than the
1337 		 * precise value.
1338 		 */
1339 		len += (len + 63) / 64;
1340 	}
1341 
1342 	if (q->max_adjlen < len)
1343 		q->max_adjlen = len;
1344 	if (q->min_adjlen > len)
1345 		q->min_adjlen = len;
1346 
1347 	return len;
1348 }
1349 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1350 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1351 {
1352 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1353 	unsigned int hdr_len, last_len = 0;
1354 	u32 off = skb_network_offset(skb);
1355 	u32 len = qdisc_pkt_len(skb);
1356 	u16 segs = 1;
1357 
1358 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1359 
1360 	if (!shinfo->gso_size)
1361 		return cake_calc_overhead(q, len, off);
1362 
1363 	/* borrowed from qdisc_pkt_len_init() */
1364 	hdr_len = skb_transport_offset(skb);
1365 
1366 	/* + transport layer */
1367 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1368 						SKB_GSO_TCPV6))) {
1369 		const struct tcphdr *th;
1370 		struct tcphdr _tcphdr;
1371 
1372 		th = skb_header_pointer(skb, hdr_len,
1373 					sizeof(_tcphdr), &_tcphdr);
1374 		if (likely(th))
1375 			hdr_len += __tcp_hdrlen(th);
1376 	} else {
1377 		struct udphdr _udphdr;
1378 
1379 		if (skb_header_pointer(skb, hdr_len,
1380 				       sizeof(_udphdr), &_udphdr))
1381 			hdr_len += sizeof(struct udphdr);
1382 	}
1383 
1384 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1385 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1386 				    shinfo->gso_size);
1387 	else
1388 		segs = shinfo->gso_segs;
1389 
1390 	len = shinfo->gso_size + hdr_len;
1391 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1392 
1393 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1394 		cake_calc_overhead(q, last_len, off));
1395 }
1396 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1397 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1398 {
1399 	struct cake_heap_entry ii = q->overflow_heap[i];
1400 	struct cake_heap_entry jj = q->overflow_heap[j];
1401 
1402 	q->overflow_heap[i] = jj;
1403 	q->overflow_heap[j] = ii;
1404 
1405 	q->tins[ii.t].overflow_idx[ii.b] = j;
1406 	q->tins[jj.t].overflow_idx[jj.b] = i;
1407 }
1408 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1409 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1410 {
1411 	struct cake_heap_entry ii = q->overflow_heap[i];
1412 
1413 	return q->tins[ii.t].backlogs[ii.b];
1414 }
1415 
cake_heapify(struct cake_sched_data * q,u16 i)1416 static void cake_heapify(struct cake_sched_data *q, u16 i)
1417 {
1418 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1419 	u32 mb = cake_heap_get_backlog(q, i);
1420 	u32 m = i;
1421 
1422 	while (m < a) {
1423 		u32 l = m + m + 1;
1424 		u32 r = l + 1;
1425 
1426 		if (l < a) {
1427 			u32 lb = cake_heap_get_backlog(q, l);
1428 
1429 			if (lb > mb) {
1430 				m  = l;
1431 				mb = lb;
1432 			}
1433 		}
1434 
1435 		if (r < a) {
1436 			u32 rb = cake_heap_get_backlog(q, r);
1437 
1438 			if (rb > mb) {
1439 				m  = r;
1440 				mb = rb;
1441 			}
1442 		}
1443 
1444 		if (m != i) {
1445 			cake_heap_swap(q, i, m);
1446 			i = m;
1447 		} else {
1448 			break;
1449 		}
1450 	}
1451 }
1452 
cake_heapify_up(struct cake_sched_data * q,u16 i)1453 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1454 {
1455 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1456 		u16 p = (i - 1) >> 1;
1457 		u32 ib = cake_heap_get_backlog(q, i);
1458 		u32 pb = cake_heap_get_backlog(q, p);
1459 
1460 		if (ib > pb) {
1461 			cake_heap_swap(q, i, p);
1462 			i = p;
1463 		} else {
1464 			break;
1465 		}
1466 	}
1467 }
1468 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1469 static int cake_advance_shaper(struct cake_sched_data *q,
1470 			       struct cake_tin_data *b,
1471 			       struct sk_buff *skb,
1472 			       ktime_t now, bool drop)
1473 {
1474 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1475 
1476 	/* charge packet bandwidth to this tin
1477 	 * and to the global shaper.
1478 	 */
1479 	if (q->rate_ns) {
1480 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1481 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1482 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1483 
1484 		if (ktime_before(b->time_next_packet, now))
1485 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1486 							   tin_dur);
1487 
1488 		else if (ktime_before(b->time_next_packet,
1489 				      ktime_add_ns(now, tin_dur)))
1490 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1491 
1492 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1493 						   global_dur);
1494 		if (!drop)
1495 			q->failsafe_next_packet = \
1496 				ktime_add_ns(q->failsafe_next_packet,
1497 					     failsafe_dur);
1498 	}
1499 	return len;
1500 }
1501 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1502 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1503 {
1504 	struct cake_sched_data *q = qdisc_priv(sch);
1505 	ktime_t now = ktime_get();
1506 	u32 idx = 0, tin = 0, len;
1507 	struct cake_heap_entry qq;
1508 	struct cake_tin_data *b;
1509 	struct cake_flow *flow;
1510 	struct sk_buff *skb;
1511 
1512 	if (!q->overflow_timeout) {
1513 		int i;
1514 		/* Build fresh max-heap */
1515 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1516 			cake_heapify(q, i);
1517 	}
1518 	q->overflow_timeout = 65535;
1519 
1520 	/* select longest queue for pruning */
1521 	qq  = q->overflow_heap[0];
1522 	tin = qq.t;
1523 	idx = qq.b;
1524 
1525 	b = &q->tins[tin];
1526 	flow = &b->flows[idx];
1527 	skb = dequeue_head(flow);
1528 	if (unlikely(!skb)) {
1529 		/* heap has gone wrong, rebuild it next time */
1530 		q->overflow_timeout = 0;
1531 		return idx + (tin << 16);
1532 	}
1533 
1534 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1535 		b->unresponsive_flow_count++;
1536 
1537 	len = qdisc_pkt_len(skb);
1538 	q->buffer_used      -= skb->truesize;
1539 	b->backlogs[idx]    -= len;
1540 	b->tin_backlog      -= len;
1541 	sch->qstats.backlog -= len;
1542 	qdisc_tree_reduce_backlog(sch, 1, len);
1543 
1544 	flow->dropped++;
1545 	b->tin_dropped++;
1546 	sch->qstats.drops++;
1547 
1548 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1549 		cake_advance_shaper(q, b, skb, now, true);
1550 
1551 	__qdisc_drop(skb, to_free);
1552 	sch->q.qlen--;
1553 
1554 	cake_heapify(q, 0);
1555 
1556 	return idx + (tin << 16);
1557 }
1558 
cake_handle_diffserv(struct sk_buff * skb,bool wash)1559 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1560 {
1561 	const int offset = skb_network_offset(skb);
1562 	u16 *buf, buf_;
1563 	u8 dscp;
1564 
1565 	switch (skb_protocol(skb, true)) {
1566 	case htons(ETH_P_IP):
1567 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1568 		if (unlikely(!buf))
1569 			return 0;
1570 
1571 		/* ToS is in the second byte of iphdr */
1572 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1573 
1574 		if (wash && dscp) {
1575 			const int wlen = offset + sizeof(struct iphdr);
1576 
1577 			if (!pskb_may_pull(skb, wlen) ||
1578 			    skb_try_make_writable(skb, wlen))
1579 				return 0;
1580 
1581 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1582 		}
1583 
1584 		return dscp;
1585 
1586 	case htons(ETH_P_IPV6):
1587 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1588 		if (unlikely(!buf))
1589 			return 0;
1590 
1591 		/* Traffic class is in the first and second bytes of ipv6hdr */
1592 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1593 
1594 		if (wash && dscp) {
1595 			const int wlen = offset + sizeof(struct ipv6hdr);
1596 
1597 			if (!pskb_may_pull(skb, wlen) ||
1598 			    skb_try_make_writable(skb, wlen))
1599 				return 0;
1600 
1601 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1602 		}
1603 
1604 		return dscp;
1605 
1606 	case htons(ETH_P_ARP):
1607 		return 0x38;  /* CS7 - Net Control */
1608 
1609 	default:
1610 		/* If there is no Diffserv field, treat as best-effort */
1611 		return 0;
1612 	}
1613 }
1614 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1615 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1616 					     struct sk_buff *skb)
1617 {
1618 	struct cake_sched_data *q = qdisc_priv(sch);
1619 	u32 tin, mark;
1620 	bool wash;
1621 	u8 dscp;
1622 
1623 	/* Tin selection: Default to diffserv-based selection, allow overriding
1624 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1625 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1626 	 */
1627 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1628 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1629 	if (wash)
1630 		dscp = cake_handle_diffserv(skb, wash);
1631 
1632 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1633 		tin = 0;
1634 
1635 	else if (mark && mark <= q->tin_cnt)
1636 		tin = q->tin_order[mark - 1];
1637 
1638 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1639 		 TC_H_MIN(skb->priority) > 0 &&
1640 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1641 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1642 
1643 	else {
1644 		if (!wash)
1645 			dscp = cake_handle_diffserv(skb, wash);
1646 		tin = q->tin_index[dscp];
1647 
1648 		if (unlikely(tin >= q->tin_cnt))
1649 			tin = 0;
1650 	}
1651 
1652 	return &q->tins[tin];
1653 }
1654 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1655 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1656 			 struct sk_buff *skb, int flow_mode, int *qerr)
1657 {
1658 	struct cake_sched_data *q = qdisc_priv(sch);
1659 	struct tcf_proto *filter;
1660 	struct tcf_result res;
1661 	u16 flow = 0, host = 0;
1662 	int result;
1663 
1664 	filter = rcu_dereference_bh(q->filter_list);
1665 	if (!filter)
1666 		goto hash;
1667 
1668 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1669 	result = tcf_classify(skb, NULL, filter, &res, false);
1670 
1671 	if (result >= 0) {
1672 #ifdef CONFIG_NET_CLS_ACT
1673 		switch (result) {
1674 		case TC_ACT_STOLEN:
1675 		case TC_ACT_QUEUED:
1676 		case TC_ACT_TRAP:
1677 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1678 			fallthrough;
1679 		case TC_ACT_SHOT:
1680 			return 0;
1681 		}
1682 #endif
1683 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1684 			flow = TC_H_MIN(res.classid);
1685 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1686 			host = TC_H_MAJ(res.classid) >> 16;
1687 	}
1688 hash:
1689 	*t = cake_select_tin(sch, skb);
1690 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1691 }
1692 
1693 static void cake_reconfigure(struct Qdisc *sch);
1694 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1695 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1696 			struct sk_buff **to_free)
1697 {
1698 	struct cake_sched_data *q = qdisc_priv(sch);
1699 	int len = qdisc_pkt_len(skb);
1700 	int ret;
1701 	struct sk_buff *ack = NULL;
1702 	ktime_t now = ktime_get();
1703 	struct cake_tin_data *b;
1704 	struct cake_flow *flow;
1705 	u32 idx;
1706 
1707 	/* choose flow to insert into */
1708 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1709 	if (idx == 0) {
1710 		if (ret & __NET_XMIT_BYPASS)
1711 			qdisc_qstats_drop(sch);
1712 		__qdisc_drop(skb, to_free);
1713 		return ret;
1714 	}
1715 	idx--;
1716 	flow = &b->flows[idx];
1717 
1718 	/* ensure shaper state isn't stale */
1719 	if (!b->tin_backlog) {
1720 		if (ktime_before(b->time_next_packet, now))
1721 			b->time_next_packet = now;
1722 
1723 		if (!sch->q.qlen) {
1724 			if (ktime_before(q->time_next_packet, now)) {
1725 				q->failsafe_next_packet = now;
1726 				q->time_next_packet = now;
1727 			} else if (ktime_after(q->time_next_packet, now) &&
1728 				   ktime_after(q->failsafe_next_packet, now)) {
1729 				u64 next = \
1730 					min(ktime_to_ns(q->time_next_packet),
1731 					    ktime_to_ns(
1732 						   q->failsafe_next_packet));
1733 				sch->qstats.overlimits++;
1734 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1735 			}
1736 		}
1737 	}
1738 
1739 	if (unlikely(len > b->max_skblen))
1740 		b->max_skblen = len;
1741 
1742 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1743 		struct sk_buff *segs, *nskb;
1744 		netdev_features_t features = netif_skb_features(skb);
1745 		unsigned int slen = 0, numsegs = 0;
1746 
1747 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1748 		if (IS_ERR_OR_NULL(segs))
1749 			return qdisc_drop(skb, sch, to_free);
1750 
1751 		skb_list_walk_safe(segs, segs, nskb) {
1752 			skb_mark_not_on_list(segs);
1753 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1754 			cobalt_set_enqueue_time(segs, now);
1755 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1756 									  segs);
1757 			flow_queue_add(flow, segs);
1758 
1759 			sch->q.qlen++;
1760 			numsegs++;
1761 			slen += segs->len;
1762 			q->buffer_used += segs->truesize;
1763 			b->packets++;
1764 		}
1765 
1766 		/* stats */
1767 		b->bytes	    += slen;
1768 		b->backlogs[idx]    += slen;
1769 		b->tin_backlog      += slen;
1770 		sch->qstats.backlog += slen;
1771 		q->avg_window_bytes += slen;
1772 
1773 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1774 		consume_skb(skb);
1775 	} else {
1776 		/* not splitting */
1777 		cobalt_set_enqueue_time(skb, now);
1778 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1779 		flow_queue_add(flow, skb);
1780 
1781 		if (q->ack_filter)
1782 			ack = cake_ack_filter(q, flow);
1783 
1784 		if (ack) {
1785 			b->ack_drops++;
1786 			sch->qstats.drops++;
1787 			b->bytes += qdisc_pkt_len(ack);
1788 			len -= qdisc_pkt_len(ack);
1789 			q->buffer_used += skb->truesize - ack->truesize;
1790 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1791 				cake_advance_shaper(q, b, ack, now, true);
1792 
1793 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1794 			consume_skb(ack);
1795 		} else {
1796 			sch->q.qlen++;
1797 			q->buffer_used      += skb->truesize;
1798 		}
1799 
1800 		/* stats */
1801 		b->packets++;
1802 		b->bytes	    += len;
1803 		b->backlogs[idx]    += len;
1804 		b->tin_backlog      += len;
1805 		sch->qstats.backlog += len;
1806 		q->avg_window_bytes += len;
1807 	}
1808 
1809 	if (q->overflow_timeout)
1810 		cake_heapify_up(q, b->overflow_idx[idx]);
1811 
1812 	/* incoming bandwidth capacity estimate */
1813 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1814 		u64 packet_interval = \
1815 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1816 
1817 		if (packet_interval > NSEC_PER_SEC)
1818 			packet_interval = NSEC_PER_SEC;
1819 
1820 		/* filter out short-term bursts, eg. wifi aggregation */
1821 		q->avg_packet_interval = \
1822 			cake_ewma(q->avg_packet_interval,
1823 				  packet_interval,
1824 				  (packet_interval > q->avg_packet_interval ?
1825 					  2 : 8));
1826 
1827 		q->last_packet_time = now;
1828 
1829 		if (packet_interval > q->avg_packet_interval) {
1830 			u64 window_interval = \
1831 				ktime_to_ns(ktime_sub(now,
1832 						      q->avg_window_begin));
1833 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1834 
1835 			b = div64_u64(b, window_interval);
1836 			q->avg_peak_bandwidth =
1837 				cake_ewma(q->avg_peak_bandwidth, b,
1838 					  b > q->avg_peak_bandwidth ? 2 : 8);
1839 			q->avg_window_bytes = 0;
1840 			q->avg_window_begin = now;
1841 
1842 			if (ktime_after(now,
1843 					ktime_add_ms(q->last_reconfig_time,
1844 						     250))) {
1845 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1846 				cake_reconfigure(sch);
1847 			}
1848 		}
1849 	} else {
1850 		q->avg_window_bytes = 0;
1851 		q->last_packet_time = now;
1852 	}
1853 
1854 	/* flowchain */
1855 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1856 		struct cake_host *srchost = &b->hosts[flow->srchost];
1857 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1858 		u16 host_load = 1;
1859 
1860 		if (!flow->set) {
1861 			list_add_tail(&flow->flowchain, &b->new_flows);
1862 		} else {
1863 			b->decaying_flow_count--;
1864 			list_move_tail(&flow->flowchain, &b->new_flows);
1865 		}
1866 		flow->set = CAKE_SET_SPARSE;
1867 		b->sparse_flow_count++;
1868 
1869 		if (cake_dsrc(q->flow_mode))
1870 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1871 
1872 		if (cake_ddst(q->flow_mode))
1873 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1874 
1875 		flow->deficit = (b->flow_quantum *
1876 				 quantum_div[host_load]) >> 16;
1877 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1878 		struct cake_host *srchost = &b->hosts[flow->srchost];
1879 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1880 
1881 		/* this flow was empty, accounted as a sparse flow, but actually
1882 		 * in the bulk rotation.
1883 		 */
1884 		flow->set = CAKE_SET_BULK;
1885 		b->sparse_flow_count--;
1886 		b->bulk_flow_count++;
1887 
1888 		if (cake_dsrc(q->flow_mode))
1889 			srchost->srchost_bulk_flow_count++;
1890 
1891 		if (cake_ddst(q->flow_mode))
1892 			dsthost->dsthost_bulk_flow_count++;
1893 
1894 	}
1895 
1896 	if (q->buffer_used > q->buffer_max_used)
1897 		q->buffer_max_used = q->buffer_used;
1898 
1899 	if (q->buffer_used > q->buffer_limit) {
1900 		u32 dropped = 0;
1901 
1902 		while (q->buffer_used > q->buffer_limit) {
1903 			dropped++;
1904 			cake_drop(sch, to_free);
1905 		}
1906 		b->drop_overlimit += dropped;
1907 	}
1908 	return NET_XMIT_SUCCESS;
1909 }
1910 
cake_dequeue_one(struct Qdisc * sch)1911 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1912 {
1913 	struct cake_sched_data *q = qdisc_priv(sch);
1914 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1915 	struct cake_flow *flow = &b->flows[q->cur_flow];
1916 	struct sk_buff *skb = NULL;
1917 	u32 len;
1918 
1919 	if (flow->head) {
1920 		skb = dequeue_head(flow);
1921 		len = qdisc_pkt_len(skb);
1922 		b->backlogs[q->cur_flow] -= len;
1923 		b->tin_backlog		 -= len;
1924 		sch->qstats.backlog      -= len;
1925 		q->buffer_used		 -= skb->truesize;
1926 		sch->q.qlen--;
1927 
1928 		if (q->overflow_timeout)
1929 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1930 	}
1931 	return skb;
1932 }
1933 
1934 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1935 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1936 {
1937 	struct cake_sched_data *q = qdisc_priv(sch);
1938 	struct sk_buff *skb;
1939 
1940 	q->cur_tin = tin;
1941 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1942 		while (!!(skb = cake_dequeue_one(sch)))
1943 			kfree_skb(skb);
1944 }
1945 
cake_dequeue(struct Qdisc * sch)1946 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1947 {
1948 	struct cake_sched_data *q = qdisc_priv(sch);
1949 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1950 	struct cake_host *srchost, *dsthost;
1951 	ktime_t now = ktime_get();
1952 	struct cake_flow *flow;
1953 	struct list_head *head;
1954 	bool first_flow = true;
1955 	struct sk_buff *skb;
1956 	u16 host_load;
1957 	u64 delay;
1958 	u32 len;
1959 
1960 begin:
1961 	if (!sch->q.qlen)
1962 		return NULL;
1963 
1964 	/* global hard shaper */
1965 	if (ktime_after(q->time_next_packet, now) &&
1966 	    ktime_after(q->failsafe_next_packet, now)) {
1967 		u64 next = min(ktime_to_ns(q->time_next_packet),
1968 			       ktime_to_ns(q->failsafe_next_packet));
1969 
1970 		sch->qstats.overlimits++;
1971 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1972 		return NULL;
1973 	}
1974 
1975 	/* Choose a class to work on. */
1976 	if (!q->rate_ns) {
1977 		/* In unlimited mode, can't rely on shaper timings, just balance
1978 		 * with DRR
1979 		 */
1980 		bool wrapped = false, empty = true;
1981 
1982 		while (b->tin_deficit < 0 ||
1983 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1984 			if (b->tin_deficit <= 0)
1985 				b->tin_deficit += b->tin_quantum;
1986 			if (b->sparse_flow_count + b->bulk_flow_count)
1987 				empty = false;
1988 
1989 			q->cur_tin++;
1990 			b++;
1991 			if (q->cur_tin >= q->tin_cnt) {
1992 				q->cur_tin = 0;
1993 				b = q->tins;
1994 
1995 				if (wrapped) {
1996 					/* It's possible for q->qlen to be
1997 					 * nonzero when we actually have no
1998 					 * packets anywhere.
1999 					 */
2000 					if (empty)
2001 						return NULL;
2002 				} else {
2003 					wrapped = true;
2004 				}
2005 			}
2006 		}
2007 	} else {
2008 		/* In shaped mode, choose:
2009 		 * - Highest-priority tin with queue and meeting schedule, or
2010 		 * - The earliest-scheduled tin with queue.
2011 		 */
2012 		ktime_t best_time = KTIME_MAX;
2013 		int tin, best_tin = 0;
2014 
2015 		for (tin = 0; tin < q->tin_cnt; tin++) {
2016 			b = q->tins + tin;
2017 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2018 				ktime_t time_to_pkt = \
2019 					ktime_sub(b->time_next_packet, now);
2020 
2021 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2022 				    ktime_compare(time_to_pkt,
2023 						  best_time) <= 0) {
2024 					best_time = time_to_pkt;
2025 					best_tin = tin;
2026 				}
2027 			}
2028 		}
2029 
2030 		q->cur_tin = best_tin;
2031 		b = q->tins + best_tin;
2032 
2033 		/* No point in going further if no packets to deliver. */
2034 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2035 			return NULL;
2036 	}
2037 
2038 retry:
2039 	/* service this class */
2040 	head = &b->decaying_flows;
2041 	if (!first_flow || list_empty(head)) {
2042 		head = &b->new_flows;
2043 		if (list_empty(head)) {
2044 			head = &b->old_flows;
2045 			if (unlikely(list_empty(head))) {
2046 				head = &b->decaying_flows;
2047 				if (unlikely(list_empty(head)))
2048 					goto begin;
2049 			}
2050 		}
2051 	}
2052 	flow = list_first_entry(head, struct cake_flow, flowchain);
2053 	q->cur_flow = flow - b->flows;
2054 	first_flow = false;
2055 
2056 	/* triple isolation (modified DRR++) */
2057 	srchost = &b->hosts[flow->srchost];
2058 	dsthost = &b->hosts[flow->dsthost];
2059 	host_load = 1;
2060 
2061 	/* flow isolation (DRR++) */
2062 	if (flow->deficit <= 0) {
2063 		/* Keep all flows with deficits out of the sparse and decaying
2064 		 * rotations.  No non-empty flow can go into the decaying
2065 		 * rotation, so they can't get deficits
2066 		 */
2067 		if (flow->set == CAKE_SET_SPARSE) {
2068 			if (flow->head) {
2069 				b->sparse_flow_count--;
2070 				b->bulk_flow_count++;
2071 
2072 				if (cake_dsrc(q->flow_mode))
2073 					srchost->srchost_bulk_flow_count++;
2074 
2075 				if (cake_ddst(q->flow_mode))
2076 					dsthost->dsthost_bulk_flow_count++;
2077 
2078 				flow->set = CAKE_SET_BULK;
2079 			} else {
2080 				/* we've moved it to the bulk rotation for
2081 				 * correct deficit accounting but we still want
2082 				 * to count it as a sparse flow, not a bulk one.
2083 				 */
2084 				flow->set = CAKE_SET_SPARSE_WAIT;
2085 			}
2086 		}
2087 
2088 		if (cake_dsrc(q->flow_mode))
2089 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2090 
2091 		if (cake_ddst(q->flow_mode))
2092 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2093 
2094 		WARN_ON(host_load > CAKE_QUEUES);
2095 
2096 		/* The get_random_u16() is a way to apply dithering to avoid
2097 		 * accumulating roundoff errors
2098 		 */
2099 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2100 				  get_random_u16()) >> 16;
2101 		list_move_tail(&flow->flowchain, &b->old_flows);
2102 
2103 		goto retry;
2104 	}
2105 
2106 	/* Retrieve a packet via the AQM */
2107 	while (1) {
2108 		skb = cake_dequeue_one(sch);
2109 		if (!skb) {
2110 			/* this queue was actually empty */
2111 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2112 				b->unresponsive_flow_count--;
2113 
2114 			if (flow->cvars.p_drop || flow->cvars.count ||
2115 			    ktime_before(now, flow->cvars.drop_next)) {
2116 				/* keep in the flowchain until the state has
2117 				 * decayed to rest
2118 				 */
2119 				list_move_tail(&flow->flowchain,
2120 					       &b->decaying_flows);
2121 				if (flow->set == CAKE_SET_BULK) {
2122 					b->bulk_flow_count--;
2123 
2124 					if (cake_dsrc(q->flow_mode))
2125 						srchost->srchost_bulk_flow_count--;
2126 
2127 					if (cake_ddst(q->flow_mode))
2128 						dsthost->dsthost_bulk_flow_count--;
2129 
2130 					b->decaying_flow_count++;
2131 				} else if (flow->set == CAKE_SET_SPARSE ||
2132 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2133 					b->sparse_flow_count--;
2134 					b->decaying_flow_count++;
2135 				}
2136 				flow->set = CAKE_SET_DECAYING;
2137 			} else {
2138 				/* remove empty queue from the flowchain */
2139 				list_del_init(&flow->flowchain);
2140 				if (flow->set == CAKE_SET_SPARSE ||
2141 				    flow->set == CAKE_SET_SPARSE_WAIT)
2142 					b->sparse_flow_count--;
2143 				else if (flow->set == CAKE_SET_BULK) {
2144 					b->bulk_flow_count--;
2145 
2146 					if (cake_dsrc(q->flow_mode))
2147 						srchost->srchost_bulk_flow_count--;
2148 
2149 					if (cake_ddst(q->flow_mode))
2150 						dsthost->dsthost_bulk_flow_count--;
2151 
2152 				} else
2153 					b->decaying_flow_count--;
2154 
2155 				flow->set = CAKE_SET_NONE;
2156 			}
2157 			goto begin;
2158 		}
2159 
2160 		/* Last packet in queue may be marked, shouldn't be dropped */
2161 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2162 					(b->bulk_flow_count *
2163 					 !!(q->rate_flags &
2164 					    CAKE_FLAG_INGRESS))) ||
2165 		    !flow->head)
2166 			break;
2167 
2168 		/* drop this packet, get another one */
2169 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2170 			len = cake_advance_shaper(q, b, skb,
2171 						  now, true);
2172 			flow->deficit -= len;
2173 			b->tin_deficit -= len;
2174 		}
2175 		flow->dropped++;
2176 		b->tin_dropped++;
2177 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2178 		qdisc_qstats_drop(sch);
2179 		kfree_skb(skb);
2180 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2181 			goto retry;
2182 	}
2183 
2184 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2185 	qdisc_bstats_update(sch, skb);
2186 
2187 	/* collect delay stats */
2188 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2189 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2190 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2191 				  delay > b->peak_delay ? 2 : 8);
2192 	b->base_delay = cake_ewma(b->base_delay, delay,
2193 				  delay < b->base_delay ? 2 : 8);
2194 
2195 	len = cake_advance_shaper(q, b, skb, now, false);
2196 	flow->deficit -= len;
2197 	b->tin_deficit -= len;
2198 
2199 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2200 		u64 next = min(ktime_to_ns(q->time_next_packet),
2201 			       ktime_to_ns(q->failsafe_next_packet));
2202 
2203 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2204 	} else if (!sch->q.qlen) {
2205 		int i;
2206 
2207 		for (i = 0; i < q->tin_cnt; i++) {
2208 			if (q->tins[i].decaying_flow_count) {
2209 				ktime_t next = \
2210 					ktime_add_ns(now,
2211 						     q->tins[i].cparams.target);
2212 
2213 				qdisc_watchdog_schedule_ns(&q->watchdog,
2214 							   ktime_to_ns(next));
2215 				break;
2216 			}
2217 		}
2218 	}
2219 
2220 	if (q->overflow_timeout)
2221 		q->overflow_timeout--;
2222 
2223 	return skb;
2224 }
2225 
cake_reset(struct Qdisc * sch)2226 static void cake_reset(struct Qdisc *sch)
2227 {
2228 	struct cake_sched_data *q = qdisc_priv(sch);
2229 	u32 c;
2230 
2231 	if (!q->tins)
2232 		return;
2233 
2234 	for (c = 0; c < CAKE_MAX_TINS; c++)
2235 		cake_clear_tin(sch, c);
2236 }
2237 
2238 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2239 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2240 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2241 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2242 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2243 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2244 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2245 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2246 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2247 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2248 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2249 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2250 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2251 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2252 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2253 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2254 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2255 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2256 };
2257 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2258 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2259 			  u64 target_ns, u64 rtt_est_ns)
2260 {
2261 	/* convert byte-rate into time-per-byte
2262 	 * so it will always unwedge in reasonable time.
2263 	 */
2264 	static const u64 MIN_RATE = 64;
2265 	u32 byte_target = mtu;
2266 	u64 byte_target_ns;
2267 	u8  rate_shft = 0;
2268 	u64 rate_ns = 0;
2269 
2270 	b->flow_quantum = 1514;
2271 	if (rate) {
2272 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2273 		rate_shft = 34;
2274 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2275 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2276 		while (!!(rate_ns >> 34)) {
2277 			rate_ns >>= 1;
2278 			rate_shft--;
2279 		}
2280 	} /* else unlimited, ie. zero delay */
2281 
2282 	b->tin_rate_bps  = rate;
2283 	b->tin_rate_ns   = rate_ns;
2284 	b->tin_rate_shft = rate_shft;
2285 
2286 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2287 
2288 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2289 	b->cparams.interval = max(rtt_est_ns +
2290 				     b->cparams.target - target_ns,
2291 				     b->cparams.target * 2);
2292 	b->cparams.mtu_time = byte_target_ns;
2293 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2294 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2295 }
2296 
cake_config_besteffort(struct Qdisc * sch)2297 static int cake_config_besteffort(struct Qdisc *sch)
2298 {
2299 	struct cake_sched_data *q = qdisc_priv(sch);
2300 	struct cake_tin_data *b = &q->tins[0];
2301 	u32 mtu = psched_mtu(qdisc_dev(sch));
2302 	u64 rate = q->rate_bps;
2303 
2304 	q->tin_cnt = 1;
2305 
2306 	q->tin_index = besteffort;
2307 	q->tin_order = normal_order;
2308 
2309 	cake_set_rate(b, rate, mtu,
2310 		      us_to_ns(q->target), us_to_ns(q->interval));
2311 	b->tin_quantum = 65535;
2312 
2313 	return 0;
2314 }
2315 
cake_config_precedence(struct Qdisc * sch)2316 static int cake_config_precedence(struct Qdisc *sch)
2317 {
2318 	/* convert high-level (user visible) parameters into internal format */
2319 	struct cake_sched_data *q = qdisc_priv(sch);
2320 	u32 mtu = psched_mtu(qdisc_dev(sch));
2321 	u64 rate = q->rate_bps;
2322 	u32 quantum = 256;
2323 	u32 i;
2324 
2325 	q->tin_cnt = 8;
2326 	q->tin_index = precedence;
2327 	q->tin_order = normal_order;
2328 
2329 	for (i = 0; i < q->tin_cnt; i++) {
2330 		struct cake_tin_data *b = &q->tins[i];
2331 
2332 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2333 			      us_to_ns(q->interval));
2334 
2335 		b->tin_quantum = max_t(u16, 1U, quantum);
2336 
2337 		/* calculate next class's parameters */
2338 		rate  *= 7;
2339 		rate >>= 3;
2340 
2341 		quantum  *= 7;
2342 		quantum >>= 3;
2343 	}
2344 
2345 	return 0;
2346 }
2347 
2348 /*	List of known Diffserv codepoints:
2349  *
2350  *	Default Forwarding (DF/CS0) - Best Effort
2351  *	Max Throughput (TOS2)
2352  *	Min Delay (TOS4)
2353  *	LLT "La" (TOS5)
2354  *	Assured Forwarding 1 (AF1x) - x3
2355  *	Assured Forwarding 2 (AF2x) - x3
2356  *	Assured Forwarding 3 (AF3x) - x3
2357  *	Assured Forwarding 4 (AF4x) - x3
2358  *	Precedence Class 1 (CS1)
2359  *	Precedence Class 2 (CS2)
2360  *	Precedence Class 3 (CS3)
2361  *	Precedence Class 4 (CS4)
2362  *	Precedence Class 5 (CS5)
2363  *	Precedence Class 6 (CS6)
2364  *	Precedence Class 7 (CS7)
2365  *	Voice Admit (VA)
2366  *	Expedited Forwarding (EF)
2367  *	Lower Effort (LE)
2368  *
2369  *	Total 26 codepoints.
2370  */
2371 
2372 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2373  *		(roughly descending order of contended priority)
2374  *		(roughly ascending order of uncontended throughput)
2375  *
2376  *	Network Control (CS6,CS7)      - routing traffic
2377  *	Telephony (EF,VA)         - aka. VoIP streams
2378  *	Signalling (CS5)               - VoIP setup
2379  *	Multimedia Conferencing (AF4x) - aka. video calls
2380  *	Realtime Interactive (CS4)     - eg. games
2381  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2382  *	Broadcast Video (CS3)
2383  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2384  *	Ops, Admin, Management (CS2)      - eg. ssh
2385  *	Standard Service (DF & unrecognised codepoints)
2386  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2387  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2388  *
2389  *	Total 12 traffic classes.
2390  */
2391 
cake_config_diffserv8(struct Qdisc * sch)2392 static int cake_config_diffserv8(struct Qdisc *sch)
2393 {
2394 /*	Pruned list of traffic classes for typical applications:
2395  *
2396  *		Network Control          (CS6, CS7)
2397  *		Minimum Latency          (EF, VA, CS5, CS4)
2398  *		Interactive Shell        (CS2)
2399  *		Low Latency Transactions (AF2x, TOS4)
2400  *		Video Streaming          (AF4x, AF3x, CS3)
2401  *		Bog Standard             (DF etc.)
2402  *		High Throughput          (AF1x, TOS2, CS1)
2403  *		Background Traffic       (LE)
2404  *
2405  *		Total 8 traffic classes.
2406  */
2407 
2408 	struct cake_sched_data *q = qdisc_priv(sch);
2409 	u32 mtu = psched_mtu(qdisc_dev(sch));
2410 	u64 rate = q->rate_bps;
2411 	u32 quantum = 256;
2412 	u32 i;
2413 
2414 	q->tin_cnt = 8;
2415 
2416 	/* codepoint to class mapping */
2417 	q->tin_index = diffserv8;
2418 	q->tin_order = normal_order;
2419 
2420 	/* class characteristics */
2421 	for (i = 0; i < q->tin_cnt; i++) {
2422 		struct cake_tin_data *b = &q->tins[i];
2423 
2424 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2425 			      us_to_ns(q->interval));
2426 
2427 		b->tin_quantum = max_t(u16, 1U, quantum);
2428 
2429 		/* calculate next class's parameters */
2430 		rate  *= 7;
2431 		rate >>= 3;
2432 
2433 		quantum  *= 7;
2434 		quantum >>= 3;
2435 	}
2436 
2437 	return 0;
2438 }
2439 
cake_config_diffserv4(struct Qdisc * sch)2440 static int cake_config_diffserv4(struct Qdisc *sch)
2441 {
2442 /*  Further pruned list of traffic classes for four-class system:
2443  *
2444  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2445  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2446  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2447  *	    Background Traffic (LE, CS1)
2448  *
2449  *		Total 4 traffic classes.
2450  */
2451 
2452 	struct cake_sched_data *q = qdisc_priv(sch);
2453 	u32 mtu = psched_mtu(qdisc_dev(sch));
2454 	u64 rate = q->rate_bps;
2455 	u32 quantum = 1024;
2456 
2457 	q->tin_cnt = 4;
2458 
2459 	/* codepoint to class mapping */
2460 	q->tin_index = diffserv4;
2461 	q->tin_order = bulk_order;
2462 
2463 	/* class characteristics */
2464 	cake_set_rate(&q->tins[0], rate, mtu,
2465 		      us_to_ns(q->target), us_to_ns(q->interval));
2466 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2467 		      us_to_ns(q->target), us_to_ns(q->interval));
2468 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2469 		      us_to_ns(q->target), us_to_ns(q->interval));
2470 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2471 		      us_to_ns(q->target), us_to_ns(q->interval));
2472 
2473 	/* bandwidth-sharing weights */
2474 	q->tins[0].tin_quantum = quantum;
2475 	q->tins[1].tin_quantum = quantum >> 4;
2476 	q->tins[2].tin_quantum = quantum >> 1;
2477 	q->tins[3].tin_quantum = quantum >> 2;
2478 
2479 	return 0;
2480 }
2481 
cake_config_diffserv3(struct Qdisc * sch)2482 static int cake_config_diffserv3(struct Qdisc *sch)
2483 {
2484 /*  Simplified Diffserv structure with 3 tins.
2485  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2486  *		Best Effort
2487  *		Low Priority		(LE, CS1)
2488  */
2489 	struct cake_sched_data *q = qdisc_priv(sch);
2490 	u32 mtu = psched_mtu(qdisc_dev(sch));
2491 	u64 rate = q->rate_bps;
2492 	u32 quantum = 1024;
2493 
2494 	q->tin_cnt = 3;
2495 
2496 	/* codepoint to class mapping */
2497 	q->tin_index = diffserv3;
2498 	q->tin_order = bulk_order;
2499 
2500 	/* class characteristics */
2501 	cake_set_rate(&q->tins[0], rate, mtu,
2502 		      us_to_ns(q->target), us_to_ns(q->interval));
2503 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2504 		      us_to_ns(q->target), us_to_ns(q->interval));
2505 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2506 		      us_to_ns(q->target), us_to_ns(q->interval));
2507 
2508 	/* bandwidth-sharing weights */
2509 	q->tins[0].tin_quantum = quantum;
2510 	q->tins[1].tin_quantum = quantum >> 4;
2511 	q->tins[2].tin_quantum = quantum >> 2;
2512 
2513 	return 0;
2514 }
2515 
cake_reconfigure(struct Qdisc * sch)2516 static void cake_reconfigure(struct Qdisc *sch)
2517 {
2518 	struct cake_sched_data *q = qdisc_priv(sch);
2519 	int c, ft;
2520 
2521 	switch (q->tin_mode) {
2522 	case CAKE_DIFFSERV_BESTEFFORT:
2523 		ft = cake_config_besteffort(sch);
2524 		break;
2525 
2526 	case CAKE_DIFFSERV_PRECEDENCE:
2527 		ft = cake_config_precedence(sch);
2528 		break;
2529 
2530 	case CAKE_DIFFSERV_DIFFSERV8:
2531 		ft = cake_config_diffserv8(sch);
2532 		break;
2533 
2534 	case CAKE_DIFFSERV_DIFFSERV4:
2535 		ft = cake_config_diffserv4(sch);
2536 		break;
2537 
2538 	case CAKE_DIFFSERV_DIFFSERV3:
2539 	default:
2540 		ft = cake_config_diffserv3(sch);
2541 		break;
2542 	}
2543 
2544 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2545 		cake_clear_tin(sch, c);
2546 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2547 	}
2548 
2549 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2550 	q->rate_shft = q->tins[ft].tin_rate_shft;
2551 
2552 	if (q->buffer_config_limit) {
2553 		q->buffer_limit = q->buffer_config_limit;
2554 	} else if (q->rate_bps) {
2555 		u64 t = q->rate_bps * q->interval;
2556 
2557 		do_div(t, USEC_PER_SEC / 4);
2558 		q->buffer_limit = max_t(u32, t, 4U << 20);
2559 	} else {
2560 		q->buffer_limit = ~0;
2561 	}
2562 
2563 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2564 
2565 	q->buffer_limit = min(q->buffer_limit,
2566 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2567 				  q->buffer_config_limit));
2568 }
2569 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2570 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2571 		       struct netlink_ext_ack *extack)
2572 {
2573 	struct cake_sched_data *q = qdisc_priv(sch);
2574 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2575 	int err;
2576 
2577 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2578 					  extack);
2579 	if (err < 0)
2580 		return err;
2581 
2582 	if (tb[TCA_CAKE_NAT]) {
2583 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2584 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2585 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2586 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2587 #else
2588 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2589 				    "No conntrack support in kernel");
2590 		return -EOPNOTSUPP;
2591 #endif
2592 	}
2593 
2594 	if (tb[TCA_CAKE_BASE_RATE64])
2595 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2596 
2597 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2598 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2599 
2600 	if (tb[TCA_CAKE_WASH]) {
2601 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2602 			q->rate_flags |= CAKE_FLAG_WASH;
2603 		else
2604 			q->rate_flags &= ~CAKE_FLAG_WASH;
2605 	}
2606 
2607 	if (tb[TCA_CAKE_FLOW_MODE])
2608 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2609 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2610 					CAKE_FLOW_MASK));
2611 
2612 	if (tb[TCA_CAKE_ATM])
2613 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2614 
2615 	if (tb[TCA_CAKE_OVERHEAD]) {
2616 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2617 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2618 
2619 		q->max_netlen = 0;
2620 		q->max_adjlen = 0;
2621 		q->min_netlen = ~0;
2622 		q->min_adjlen = ~0;
2623 	}
2624 
2625 	if (tb[TCA_CAKE_RAW]) {
2626 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2627 
2628 		q->max_netlen = 0;
2629 		q->max_adjlen = 0;
2630 		q->min_netlen = ~0;
2631 		q->min_adjlen = ~0;
2632 	}
2633 
2634 	if (tb[TCA_CAKE_MPU])
2635 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2636 
2637 	if (tb[TCA_CAKE_RTT]) {
2638 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2639 
2640 		if (!q->interval)
2641 			q->interval = 1;
2642 	}
2643 
2644 	if (tb[TCA_CAKE_TARGET]) {
2645 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2646 
2647 		if (!q->target)
2648 			q->target = 1;
2649 	}
2650 
2651 	if (tb[TCA_CAKE_AUTORATE]) {
2652 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2653 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2654 		else
2655 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2656 	}
2657 
2658 	if (tb[TCA_CAKE_INGRESS]) {
2659 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2660 			q->rate_flags |= CAKE_FLAG_INGRESS;
2661 		else
2662 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2663 	}
2664 
2665 	if (tb[TCA_CAKE_ACK_FILTER])
2666 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2667 
2668 	if (tb[TCA_CAKE_MEMORY])
2669 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2670 
2671 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2672 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2673 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2674 		else
2675 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2676 	}
2677 
2678 	if (tb[TCA_CAKE_FWMARK]) {
2679 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2680 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2681 	}
2682 
2683 	if (q->tins) {
2684 		sch_tree_lock(sch);
2685 		cake_reconfigure(sch);
2686 		sch_tree_unlock(sch);
2687 	}
2688 
2689 	return 0;
2690 }
2691 
cake_destroy(struct Qdisc * sch)2692 static void cake_destroy(struct Qdisc *sch)
2693 {
2694 	struct cake_sched_data *q = qdisc_priv(sch);
2695 
2696 	qdisc_watchdog_cancel(&q->watchdog);
2697 	tcf_block_put(q->block);
2698 	kvfree(q->tins);
2699 }
2700 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2701 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2702 		     struct netlink_ext_ack *extack)
2703 {
2704 	struct cake_sched_data *q = qdisc_priv(sch);
2705 	int i, j, err;
2706 
2707 	sch->limit = 10240;
2708 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2709 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2710 
2711 	q->rate_bps = 0; /* unlimited by default */
2712 
2713 	q->interval = 100000; /* 100ms default */
2714 	q->target   =   5000; /* 5ms: codel RFC argues
2715 			       * for 5 to 10% of interval
2716 			       */
2717 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2718 	q->cur_tin = 0;
2719 	q->cur_flow  = 0;
2720 
2721 	qdisc_watchdog_init(&q->watchdog, sch);
2722 
2723 	if (opt) {
2724 		err = cake_change(sch, opt, extack);
2725 
2726 		if (err)
2727 			return err;
2728 	}
2729 
2730 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2731 	if (err)
2732 		return err;
2733 
2734 	quantum_div[0] = ~0;
2735 	for (i = 1; i <= CAKE_QUEUES; i++)
2736 		quantum_div[i] = 65535 / i;
2737 
2738 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2739 			   GFP_KERNEL);
2740 	if (!q->tins)
2741 		return -ENOMEM;
2742 
2743 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2744 		struct cake_tin_data *b = q->tins + i;
2745 
2746 		INIT_LIST_HEAD(&b->new_flows);
2747 		INIT_LIST_HEAD(&b->old_flows);
2748 		INIT_LIST_HEAD(&b->decaying_flows);
2749 		b->sparse_flow_count = 0;
2750 		b->bulk_flow_count = 0;
2751 		b->decaying_flow_count = 0;
2752 
2753 		for (j = 0; j < CAKE_QUEUES; j++) {
2754 			struct cake_flow *flow = b->flows + j;
2755 			u32 k = j * CAKE_MAX_TINS + i;
2756 
2757 			INIT_LIST_HEAD(&flow->flowchain);
2758 			cobalt_vars_init(&flow->cvars);
2759 
2760 			q->overflow_heap[k].t = i;
2761 			q->overflow_heap[k].b = j;
2762 			b->overflow_idx[j] = k;
2763 		}
2764 	}
2765 
2766 	cake_reconfigure(sch);
2767 	q->avg_peak_bandwidth = q->rate_bps;
2768 	q->min_netlen = ~0;
2769 	q->min_adjlen = ~0;
2770 	return 0;
2771 }
2772 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2773 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2774 {
2775 	struct cake_sched_data *q = qdisc_priv(sch);
2776 	struct nlattr *opts;
2777 
2778 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2779 	if (!opts)
2780 		goto nla_put_failure;
2781 
2782 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2783 			      TCA_CAKE_PAD))
2784 		goto nla_put_failure;
2785 
2786 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2787 			q->flow_mode & CAKE_FLOW_MASK))
2788 		goto nla_put_failure;
2789 
2790 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2791 		goto nla_put_failure;
2792 
2793 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2794 		goto nla_put_failure;
2795 
2796 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2797 		goto nla_put_failure;
2798 
2799 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2800 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2801 		goto nla_put_failure;
2802 
2803 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2804 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2805 		goto nla_put_failure;
2806 
2807 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2808 		goto nla_put_failure;
2809 
2810 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2811 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2812 		goto nla_put_failure;
2813 
2814 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2815 		goto nla_put_failure;
2816 
2817 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2818 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2819 		goto nla_put_failure;
2820 
2821 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2822 		goto nla_put_failure;
2823 
2824 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2825 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2826 			goto nla_put_failure;
2827 
2828 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2829 		goto nla_put_failure;
2830 
2831 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2832 		goto nla_put_failure;
2833 
2834 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2835 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2836 		goto nla_put_failure;
2837 
2838 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2839 		goto nla_put_failure;
2840 
2841 	return nla_nest_end(skb, opts);
2842 
2843 nla_put_failure:
2844 	return -1;
2845 }
2846 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2847 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2848 {
2849 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2850 	struct cake_sched_data *q = qdisc_priv(sch);
2851 	struct nlattr *tstats, *ts;
2852 	int i;
2853 
2854 	if (!stats)
2855 		return -1;
2856 
2857 #define PUT_STAT_U32(attr, data) do {				       \
2858 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2859 			goto nla_put_failure;			       \
2860 	} while (0)
2861 #define PUT_STAT_U64(attr, data) do {				       \
2862 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2863 					data, TCA_CAKE_STATS_PAD)) \
2864 			goto nla_put_failure;			       \
2865 	} while (0)
2866 
2867 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2868 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2869 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2870 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2871 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2872 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2873 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2874 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2875 
2876 #undef PUT_STAT_U32
2877 #undef PUT_STAT_U64
2878 
2879 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2880 	if (!tstats)
2881 		goto nla_put_failure;
2882 
2883 #define PUT_TSTAT_U32(attr, data) do {					\
2884 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2885 			goto nla_put_failure;				\
2886 	} while (0)
2887 #define PUT_TSTAT_U64(attr, data) do {					\
2888 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2889 					data, TCA_CAKE_TIN_STATS_PAD))	\
2890 			goto nla_put_failure;				\
2891 	} while (0)
2892 
2893 	for (i = 0; i < q->tin_cnt; i++) {
2894 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2895 
2896 		ts = nla_nest_start_noflag(d->skb, i + 1);
2897 		if (!ts)
2898 			goto nla_put_failure;
2899 
2900 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2901 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2902 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2903 
2904 		PUT_TSTAT_U32(TARGET_US,
2905 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2906 		PUT_TSTAT_U32(INTERVAL_US,
2907 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2908 
2909 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2910 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2911 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2912 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2913 
2914 		PUT_TSTAT_U32(PEAK_DELAY_US,
2915 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2916 		PUT_TSTAT_U32(AVG_DELAY_US,
2917 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2918 		PUT_TSTAT_U32(BASE_DELAY_US,
2919 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2920 
2921 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2922 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2923 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2924 
2925 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2926 					    b->decaying_flow_count);
2927 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2928 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2929 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2930 
2931 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2932 		nla_nest_end(d->skb, ts);
2933 	}
2934 
2935 #undef PUT_TSTAT_U32
2936 #undef PUT_TSTAT_U64
2937 
2938 	nla_nest_end(d->skb, tstats);
2939 	return nla_nest_end(d->skb, stats);
2940 
2941 nla_put_failure:
2942 	nla_nest_cancel(d->skb, stats);
2943 	return -1;
2944 }
2945 
cake_leaf(struct Qdisc * sch,unsigned long arg)2946 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2947 {
2948 	return NULL;
2949 }
2950 
cake_find(struct Qdisc * sch,u32 classid)2951 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2952 {
2953 	return 0;
2954 }
2955 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2956 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2957 			       u32 classid)
2958 {
2959 	return 0;
2960 }
2961 
cake_unbind(struct Qdisc * q,unsigned long cl)2962 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2963 {
2964 }
2965 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2966 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2967 					struct netlink_ext_ack *extack)
2968 {
2969 	struct cake_sched_data *q = qdisc_priv(sch);
2970 
2971 	if (cl)
2972 		return NULL;
2973 	return q->block;
2974 }
2975 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)2976 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2977 			   struct sk_buff *skb, struct tcmsg *tcm)
2978 {
2979 	tcm->tcm_handle |= TC_H_MIN(cl);
2980 	return 0;
2981 }
2982 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)2983 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2984 				 struct gnet_dump *d)
2985 {
2986 	struct cake_sched_data *q = qdisc_priv(sch);
2987 	const struct cake_flow *flow = NULL;
2988 	struct gnet_stats_queue qs = { 0 };
2989 	struct nlattr *stats;
2990 	u32 idx = cl - 1;
2991 
2992 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2993 		const struct cake_tin_data *b = \
2994 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2995 		const struct sk_buff *skb;
2996 
2997 		flow = &b->flows[idx % CAKE_QUEUES];
2998 
2999 		if (flow->head) {
3000 			sch_tree_lock(sch);
3001 			skb = flow->head;
3002 			while (skb) {
3003 				qs.qlen++;
3004 				skb = skb->next;
3005 			}
3006 			sch_tree_unlock(sch);
3007 		}
3008 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3009 		qs.drops = flow->dropped;
3010 	}
3011 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3012 		return -1;
3013 	if (flow) {
3014 		ktime_t now = ktime_get();
3015 
3016 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3017 		if (!stats)
3018 			return -1;
3019 
3020 #define PUT_STAT_U32(attr, data) do {				       \
3021 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3022 			goto nla_put_failure;			       \
3023 	} while (0)
3024 #define PUT_STAT_S32(attr, data) do {				       \
3025 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3026 			goto nla_put_failure;			       \
3027 	} while (0)
3028 
3029 		PUT_STAT_S32(DEFICIT, flow->deficit);
3030 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3031 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3032 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3033 		if (flow->cvars.p_drop) {
3034 			PUT_STAT_S32(BLUE_TIMER_US,
3035 				     ktime_to_us(
3036 					     ktime_sub(now,
3037 						       flow->cvars.blue_timer)));
3038 		}
3039 		if (flow->cvars.dropping) {
3040 			PUT_STAT_S32(DROP_NEXT_US,
3041 				     ktime_to_us(
3042 					     ktime_sub(now,
3043 						       flow->cvars.drop_next)));
3044 		}
3045 
3046 		if (nla_nest_end(d->skb, stats) < 0)
3047 			return -1;
3048 	}
3049 
3050 	return 0;
3051 
3052 nla_put_failure:
3053 	nla_nest_cancel(d->skb, stats);
3054 	return -1;
3055 }
3056 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3057 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3058 {
3059 	struct cake_sched_data *q = qdisc_priv(sch);
3060 	unsigned int i, j;
3061 
3062 	if (arg->stop)
3063 		return;
3064 
3065 	for (i = 0; i < q->tin_cnt; i++) {
3066 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3067 
3068 		for (j = 0; j < CAKE_QUEUES; j++) {
3069 			if (list_empty(&b->flows[j].flowchain)) {
3070 				arg->count++;
3071 				continue;
3072 			}
3073 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3074 						 arg))
3075 				break;
3076 		}
3077 	}
3078 }
3079 
3080 static const struct Qdisc_class_ops cake_class_ops = {
3081 	.leaf		=	cake_leaf,
3082 	.find		=	cake_find,
3083 	.tcf_block	=	cake_tcf_block,
3084 	.bind_tcf	=	cake_bind,
3085 	.unbind_tcf	=	cake_unbind,
3086 	.dump		=	cake_dump_class,
3087 	.dump_stats	=	cake_dump_class_stats,
3088 	.walk		=	cake_walk,
3089 };
3090 
3091 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3092 	.cl_ops		=	&cake_class_ops,
3093 	.id		=	"cake",
3094 	.priv_size	=	sizeof(struct cake_sched_data),
3095 	.enqueue	=	cake_enqueue,
3096 	.dequeue	=	cake_dequeue,
3097 	.peek		=	qdisc_peek_dequeued,
3098 	.init		=	cake_init,
3099 	.reset		=	cake_reset,
3100 	.destroy	=	cake_destroy,
3101 	.change		=	cake_change,
3102 	.dump		=	cake_dump,
3103 	.dump_stats	=	cake_dump_stats,
3104 	.owner		=	THIS_MODULE,
3105 };
3106 
cake_module_init(void)3107 static int __init cake_module_init(void)
3108 {
3109 	return register_qdisc(&cake_qdisc_ops);
3110 }
3111 
cake_module_exit(void)3112 static void __exit cake_module_exit(void)
3113 {
3114 	unregister_qdisc(&cake_qdisc_ops);
3115 }
3116 
3117 module_init(cake_module_init)
3118 module_exit(cake_module_exit)
3119 MODULE_AUTHOR("Jonathan Morton");
3120 MODULE_LICENSE("Dual BSD/GPL");
3121 MODULE_DESCRIPTION("The CAKE shaper.");
3122