xref: /openbmc/linux/net/sched/sch_cake.c (revision d32fd6bb9f2bc8178cdd65ebec1ad670a8bfa241)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
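/* A worked example of the deficit-mode shaper described above (illustrative
 * numbers, not defaults): with the shaped rate set to 100 Mbit/s, dequeuing
 * a 1514-byte packet pushes the earliest permitted departure of the next
 * packet out by 1514 * 8 / 100e6 s ~= 121 us.  There is no token bucket to
 * configure; the only burst allowed is whatever accumulates while waiting
 * for the watchdog/scheduler to run the qdisc again.
 */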
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
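/* Reading the lookup tables above: each is indexed by the 6-bit DSCP value
 * and returns a tin number.  With diffserv3, for instance, LE (DSCP 1) and
 * CS1 (8) land in tin 1 (Bulk); DSCP 4, VA (44), EF (46), CS6 (48) and CS7
 * (56) land in tin 2 (Voice); every other codepoint stays in tin 0 (Best
 * Effort).  bulk_order[] only affects the order tins appear in the stats
 * dump, not their scheduling priority.
 */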
363 
364 #define REC_INV_SQRT_CACHE (16)
365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366 
367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369  *
370  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371  */
372 
373 static void cobalt_newton_step(struct cobalt_vars *vars)
374 {
375 	u32 invsqrt, invsqrt2;
376 	u64 val;
377 
378 	invsqrt = vars->rec_inv_sqrt;
379 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381 
382 	val >>= 2; /* avoid overflow in following multiply */
383 	val = (val * invsqrt) >> (32 - 2 + 1);
384 
385 	vars->rec_inv_sqrt = val;
386 }
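/* Sanity check for the step above: rec_inv_sqrt is a Q0.32 estimate of
 * 1/sqrt(count).  If the estimate is already exact, e.g. count = 4 with
 * invsqrt = 0.5 (0x80000000), then invsqrt2 = 0.25,
 * 3 - count * invsqrt2 = 2 and (0.5 / 2) * 2 = 0.5, so the value is a fixed
 * point of the iteration; an inexact estimate converges towards the true
 * value quadratically, which is why one step per change of count is enough
 * once the small counts are seeded from the cache below.
 */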
387 
388 static void cobalt_invsqrt(struct cobalt_vars *vars)
389 {
390 	if (vars->count < REC_INV_SQRT_CACHE)
391 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392 	else
393 		cobalt_newton_step(vars);
394 }
395 
396 /* There is a big difference in timing between the accurate values placed in
397  * the cache and the approximations given by a single Newton step for small
398  * count values, particularly when stepping from count 1 to 2 or vice versa.
399  * Above 16, a single Newton step gives sufficient accuracy in either
400  * direction, given the precision stored.
401  *
402  * The magnitude of the error when stepping up to count 2 is such as to give
403  * the value that *should* have been produced at count 4.
404  */
405 
406 static void cobalt_cache_init(void)
407 {
408 	struct cobalt_vars v;
409 
410 	memset(&v, 0, sizeof(v));
411 	v.rec_inv_sqrt = ~0U;
412 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413 
414 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 		cobalt_newton_step(&v);
419 
420 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421 	}
422 }
423 
424 static void cobalt_vars_init(struct cobalt_vars *vars)
425 {
426 	memset(vars, 0, sizeof(*vars));
427 
428 	if (!cobalt_rec_inv_sqrt_cache[0]) {
429 		cobalt_cache_init();
430 		cobalt_rec_inv_sqrt_cache[0] = ~0;
431 	}
432 }
433 
434 /* CoDel control_law is t + interval/sqrt(count)
435  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436  * both sqrt() and divide operation.
437  */
438 static ktime_t cobalt_control(ktime_t t,
439 			      u64 interval,
440 			      u32 rec_inv_sqrt)
441 {
442 	return ktime_add_ns(t, reciprocal_scale(interval,
443 						rec_inv_sqrt));
444 }
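/* Worked example (illustrative values): with interval = 100 ms and
 * count = 4, rec_inv_sqrt ~= 0.5 in Q0.32, so
 * reciprocal_scale(interval, rec_inv_sqrt) == (interval * rec_inv_sqrt) >> 32
 * ~= 50 ms.  The next signalling event is thus scheduled interval/sqrt(count)
 * after 't', as in reference CoDel, but using only a multiply and a shift.
 */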
445 
446 /* Call this when a packet had to be dropped due to queue overflow.  Returns
447  * true if the BLUE state was quiescent before but active after this call.
448  */
449 static bool cobalt_queue_full(struct cobalt_vars *vars,
450 			      struct cobalt_params *p,
451 			      ktime_t now)
452 {
453 	bool up = false;
454 
455 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456 		up = !vars->p_drop;
457 		vars->p_drop += p->p_inc;
458 		if (vars->p_drop < p->p_inc)
459 			vars->p_drop = ~0;
460 		vars->blue_timer = now;
461 	}
462 	vars->dropping = true;
463 	vars->drop_next = now;
464 	if (!vars->count)
465 		vars->count = 1;
466 
467 	return up;
468 }
469 
470 /* Call this when the queue was serviced but turned out to be empty.  Returns
471  * true if the BLUE state was active before but quiescent after this call.
472  */
473 static bool cobalt_queue_empty(struct cobalt_vars *vars,
474 			       struct cobalt_params *p,
475 			       ktime_t now)
476 {
477 	bool down = false;
478 
479 	if (vars->p_drop &&
480 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481 		if (vars->p_drop < p->p_dec)
482 			vars->p_drop = 0;
483 		else
484 			vars->p_drop -= p->p_dec;
485 		vars->blue_timer = now;
486 		down = !vars->p_drop;
487 	}
488 	vars->dropping = false;
489 
490 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491 		vars->count--;
492 		cobalt_invsqrt(vars);
493 		vars->drop_next = cobalt_control(vars->drop_next,
494 						 p->interval,
495 						 vars->rec_inv_sqrt);
496 	}
497 
498 	return down;
499 }
500 
501 /* Call this with a freshly dequeued packet for possible congestion marking.
502  * Returns true as an instruction to drop the packet, false for delivery.
503  */
504 static bool cobalt_should_drop(struct cobalt_vars *vars,
505 			       struct cobalt_params *p,
506 			       ktime_t now,
507 			       struct sk_buff *skb,
508 			       u32 bulk_flows)
509 {
510 	bool next_due, over_target, drop = false;
511 	ktime_t schedule;
512 	u64 sojourn;
513 
514 /* The 'schedule' variable records, in its sign, whether 'now' is before or
515  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516  * scheduling decision is actually branched, without destroying that
517  * information.  Similarly, the first 'schedule' value calculated is preserved
518  * in the boolean 'next_due'.
519  *
520  * As for 'drop_next', we take advantage of the fact that 'interval' is both
521  * the delay between first exceeding 'target' and the first signalling event,
522  * *and* the scaling factor for the signalling frequency.  It's therefore very
523  * natural to use a single mechanism for both purposes, and eliminates a
524  * significant amount of reference Codel's spaghetti code.  To help with this,
525  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526  * as possible to 1.0 in fixed-point.
527  */
528 
529 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530 	schedule = ktime_sub(now, vars->drop_next);
531 	over_target = sojourn > p->target &&
532 		      sojourn > p->mtu_time * bulk_flows * 2 &&
533 		      sojourn > p->mtu_time * 4;
534 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535 
536 	vars->ecn_marked = false;
537 
538 	if (over_target) {
539 		if (!vars->dropping) {
540 			vars->dropping = true;
541 			vars->drop_next = cobalt_control(now,
542 							 p->interval,
543 							 vars->rec_inv_sqrt);
544 		}
545 		if (!vars->count)
546 			vars->count = 1;
547 	} else if (vars->dropping) {
548 		vars->dropping = false;
549 	}
550 
551 	if (next_due && vars->dropping) {
552 		/* Use ECN mark if possible, otherwise drop */
553 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554 
555 		vars->count++;
556 		if (!vars->count)
557 			vars->count--;
558 		cobalt_invsqrt(vars);
559 		vars->drop_next = cobalt_control(vars->drop_next,
560 						 p->interval,
561 						 vars->rec_inv_sqrt);
562 		schedule = ktime_sub(now, vars->drop_next);
563 	} else {
564 		while (next_due) {
565 			vars->count--;
566 			cobalt_invsqrt(vars);
567 			vars->drop_next = cobalt_control(vars->drop_next,
568 							 p->interval,
569 							 vars->rec_inv_sqrt);
570 			schedule = ktime_sub(now, vars->drop_next);
571 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572 		}
573 	}
574 
575 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
576 	if (vars->p_drop)
577 		drop |= (get_random_u32() < vars->p_drop);
578 
579 	/* Overload the drop_next field as an activity timeout */
580 	if (!vars->count)
581 		vars->drop_next = ktime_add_ns(now, p->interval);
582 	else if (ktime_to_ns(schedule) > 0 && !drop)
583 		vars->drop_next = now;
584 
585 	return drop;
586 }
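/* Example of the over_target test above (illustrative numbers): at 1 Mbit/s
 * the serialisation delay of a 1514-byte packet is mtu_time ~= 12.1 ms,
 * larger than a typical 5 ms target.  With one bulk flow the sojourn time
 * must then exceed max(target, 2 * mtu_time, 4 * mtu_time) ~= 48 ms before
 * Codel starts signalling; this is the "auto-tuned at low bandwidth"
 * behaviour referred to in the header comment.
 */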
587 
588 static bool cake_update_flowkeys(struct flow_keys *keys,
589 				 const struct sk_buff *skb)
590 {
591 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592 	struct nf_conntrack_tuple tuple = {};
593 	bool rev = !skb->_nfct, upd = false;
594 	__be32 ip;
595 
596 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
597 		return false;
598 
599 	if (!nf_ct_get_tuple_skb(&tuple, skb))
600 		return false;
601 
602 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603 	if (ip != keys->addrs.v4addrs.src) {
604 		keys->addrs.v4addrs.src = ip;
605 		upd = true;
606 	}
607 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608 	if (ip != keys->addrs.v4addrs.dst) {
609 		keys->addrs.v4addrs.dst = ip;
610 		upd = true;
611 	}
612 
613 	if (keys->ports.ports) {
614 		__be16 port;
615 
616 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
617 		if (port != keys->ports.src) {
618 			keys->ports.src = port;
619 			upd = true;
620 		}
621 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
622 		if (port != keys->ports.dst) {
623 			keys->ports.dst = port;
624 			upd = true;
625 		}
626 	}
627 	return upd;
628 #else
629 	return false;
630 #endif
631 }
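/* The conntrack lookup above only happens when NAT awareness is enabled in
 * the flow mode (with the iproute2 front end this is usually the "nat"
 * keyword).  It rewrites the dissected addresses/ports back to their
 * pre-NAT values, so host-fairness applies to the individual machines
 * behind a masquerading router instead of to the single public address
 * they share.
 */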
632 
633 /* Cake's flow modes are multi-bit values, so the dual-src/dual-dst tests
634  * below also match triple-isolate mode, which sets both bits.
635  */
636 
637 static bool cake_dsrc(int flow_mode)
638 {
639 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640 }
641 
642 static bool cake_ddst(int flow_mode)
643 {
644 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645 }
646 
647 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
648 					     struct cake_flow *flow,
649 					     int flow_mode)
650 {
651 	if (likely(cake_dsrc(flow_mode) &&
652 		   q->hosts[flow->srchost].srchost_bulk_flow_count))
653 		q->hosts[flow->srchost].srchost_bulk_flow_count--;
654 }
655 
656 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
657 					     struct cake_flow *flow,
658 					     int flow_mode)
659 {
660 	if (likely(cake_dsrc(flow_mode) &&
661 		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
662 		q->hosts[flow->srchost].srchost_bulk_flow_count++;
663 }
664 
665 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
666 					     struct cake_flow *flow,
667 					     int flow_mode)
668 {
669 	if (likely(cake_ddst(flow_mode) &&
670 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
671 		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
672 }
673 
674 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
675 					     struct cake_flow *flow,
676 					     int flow_mode)
677 {
678 	if (likely(cake_ddst(flow_mode) &&
679 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
680 		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
681 }
682 
683 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
684 				 struct cake_flow *flow,
685 				 int flow_mode)
686 {
687 	u16 host_load = 1;
688 
689 	if (cake_dsrc(flow_mode))
690 		host_load = max(host_load,
691 				q->hosts[flow->srchost].srchost_bulk_flow_count);
692 
693 	if (cake_ddst(flow_mode))
694 		host_load = max(host_load,
695 				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
696 
697 	/* The get_random_u16() is a way to apply dithering to avoid
698 	 * accumulating roundoff errors
699 	 */
700 	return (q->flow_quantum * quantum_div[host_load] +
701 		get_random_u16()) >> 16;
702 }
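/* Worked example (illustrative numbers): with flow_quantum = 1514 and the
 * busiest host owning three bulk flows, quantum_div[3] ~= 65536 / 3, so the
 * expression above yields roughly 1514 / 3 ~= 504 bytes of deficit per
 * scheduling round, the random dither deciding whether a particular round
 * gets 504 or 505.
 */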
703 
704 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
705 		     int flow_mode, u16 flow_override, u16 host_override)
706 {
707 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
708 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
709 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
710 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
711 	u16 reduced_hash, srchost_idx, dsthost_idx;
712 	struct flow_keys keys, host_keys;
713 	bool use_skbhash = skb->l4_hash;
714 
715 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
716 		return 0;
717 
718 	/* If both overrides are set, or we can use the SKB hash and nat mode is
719 	 * disabled, we can skip packet dissection entirely. If nat mode is
720 	 * enabled there's another check below after doing the conntrack lookup.
721 	 */
722 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
723 		goto skip_hash;
724 
725 	skb_flow_dissect_flow_keys(skb, &keys,
726 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
727 
728 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
729 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
730 		use_skbhash = false;
731 
732 	/* If we can still use the SKB hash and don't need the host hash, we can
733 	 * skip the rest of the hashing procedure
734 	 */
735 	if (use_skbhash && !hash_hosts)
736 		goto skip_hash;
737 
738 	/* flow_hash_from_keys() sorts the addresses by value, so we have
739 	 * to preserve their order in a separate data structure to treat
740 	 * src and dst host addresses as independently selectable.
741 	 */
742 	host_keys = keys;
743 	host_keys.ports.ports     = 0;
744 	host_keys.basic.ip_proto  = 0;
745 	host_keys.keyid.keyid     = 0;
746 	host_keys.tags.flow_label = 0;
747 
748 	switch (host_keys.control.addr_type) {
749 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
750 		host_keys.addrs.v4addrs.src = 0;
751 		dsthost_hash = flow_hash_from_keys(&host_keys);
752 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
753 		host_keys.addrs.v4addrs.dst = 0;
754 		srchost_hash = flow_hash_from_keys(&host_keys);
755 		break;
756 
757 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
758 		memset(&host_keys.addrs.v6addrs.src, 0,
759 		       sizeof(host_keys.addrs.v6addrs.src));
760 		dsthost_hash = flow_hash_from_keys(&host_keys);
761 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
762 		memset(&host_keys.addrs.v6addrs.dst, 0,
763 		       sizeof(host_keys.addrs.v6addrs.dst));
764 		srchost_hash = flow_hash_from_keys(&host_keys);
765 		break;
766 
767 	default:
768 		dsthost_hash = 0;
769 		srchost_hash = 0;
770 	}
771 
772 	/* This *must* be after the above switch, since as a
773 	 * side-effect it sorts the src and dst addresses.
774 	 */
775 	if (hash_flows && !use_skbhash)
776 		flow_hash = flow_hash_from_keys(&keys);
777 
778 skip_hash:
779 	if (flow_override)
780 		flow_hash = flow_override - 1;
781 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
782 		flow_hash = skb->hash;
783 	if (host_override) {
784 		dsthost_hash = host_override - 1;
785 		srchost_hash = host_override - 1;
786 	}
787 
788 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
789 		if (flow_mode & CAKE_FLOW_SRC_IP)
790 			flow_hash ^= srchost_hash;
791 
792 		if (flow_mode & CAKE_FLOW_DST_IP)
793 			flow_hash ^= dsthost_hash;
794 	}
795 
796 	reduced_hash = flow_hash % CAKE_QUEUES;
797 
798 	/* set-associative hashing */
799 	/* fast path if no hash collision (direct lookup succeeds) */
800 	if (likely(q->tags[reduced_hash] == flow_hash &&
801 		   q->flows[reduced_hash].set)) {
802 		q->way_directs++;
803 	} else {
804 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
805 		u32 outer_hash = reduced_hash - inner_hash;
806 		bool allocate_src = false;
807 		bool allocate_dst = false;
808 		u32 i, k;
809 
810 		/* check if any active queue in the set is reserved for
811 		 * this flow.
812 		 */
813 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
814 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
815 			if (q->tags[outer_hash + k] == flow_hash) {
816 				if (i)
817 					q->way_hits++;
818 
819 				if (!q->flows[outer_hash + k].set) {
820 					/* need to increment host refcnts */
821 					allocate_src = cake_dsrc(flow_mode);
822 					allocate_dst = cake_ddst(flow_mode);
823 				}
824 
825 				goto found;
826 			}
827 		}
828 
829 		/* no queue is reserved for this flow, look for an
830 		 * empty one.
831 		 */
832 		for (i = 0; i < CAKE_SET_WAYS;
833 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
834 			if (!q->flows[outer_hash + k].set) {
835 				q->way_misses++;
836 				allocate_src = cake_dsrc(flow_mode);
837 				allocate_dst = cake_ddst(flow_mode);
838 				goto found;
839 			}
840 		}
841 
842 		/* With no empty queues, default to the original
843 		 * queue, accept the collision, update the host tags.
844 		 */
845 		q->way_collisions++;
846 		allocate_src = cake_dsrc(flow_mode);
847 		allocate_dst = cake_ddst(flow_mode);
848 
849 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
850 			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
851 			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
852 		}
853 found:
854 		/* reserve queue for future packets in same flow */
855 		reduced_hash = outer_hash + k;
856 		q->tags[reduced_hash] = flow_hash;
857 
858 		if (allocate_src) {
859 			srchost_idx = srchost_hash % CAKE_QUEUES;
860 			inner_hash = srchost_idx % CAKE_SET_WAYS;
861 			outer_hash = srchost_idx - inner_hash;
862 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
863 				i++, k = (k + 1) % CAKE_SET_WAYS) {
864 				if (q->hosts[outer_hash + k].srchost_tag ==
865 				    srchost_hash)
866 					goto found_src;
867 			}
868 			for (i = 0; i < CAKE_SET_WAYS;
869 				i++, k = (k + 1) % CAKE_SET_WAYS) {
870 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
871 					break;
872 			}
873 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
874 found_src:
875 			srchost_idx = outer_hash + k;
876 			q->flows[reduced_hash].srchost = srchost_idx;
877 
878 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
879 				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
880 		}
881 
882 		if (allocate_dst) {
883 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
884 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
885 			outer_hash = dsthost_idx - inner_hash;
886 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
887 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
888 				if (q->hosts[outer_hash + k].dsthost_tag ==
889 				    dsthost_hash)
890 					goto found_dst;
891 			}
892 			for (i = 0; i < CAKE_SET_WAYS;
893 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
894 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
895 					break;
896 			}
897 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
898 found_dst:
899 			dsthost_idx = outer_hash + k;
900 			q->flows[reduced_hash].dsthost = dsthost_idx;
901 
902 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
903 				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
904 		}
905 	}
906 
907 	return reduced_hash;
908 }
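/* Example of the set-associative lookup above: a flow whose hash reduces to
 * queue 53 has inner_hash = 53 % 8 = 5 and outer_hash = 48, so its set is
 * queues 48..55.  If queue 53 is already tagged for another flow, the new
 * flow may claim any queue in that set (a way_hit if its own tag is found
 * elsewhere in the set, a way_miss if it takes an empty queue); only when
 * all eight are occupied must two flows share a queue (a way_collision).
 */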
909 
910 /* helper functions: might be changed when/if skb uses a standard list_head */
911 /* remove one skb from head of slot queue */
912 
913 static struct sk_buff *dequeue_head(struct cake_flow *flow)
914 {
915 	struct sk_buff *skb = flow->head;
916 
917 	if (skb) {
918 		flow->head = skb->next;
919 		skb_mark_not_on_list(skb);
920 	}
921 
922 	return skb;
923 }
924 
925 /* add skb to flow queue (tail add) */
926 
927 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
928 {
929 	if (!flow->head)
930 		flow->head = skb;
931 	else
932 		flow->tail->next = skb;
933 	flow->tail = skb;
934 	skb->next = NULL;
935 }
936 
937 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
938 				    struct ipv6hdr *buf)
939 {
940 	unsigned int offset = skb_network_offset(skb);
941 	struct iphdr *iph;
942 
943 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
944 
945 	if (!iph)
946 		return NULL;
947 
948 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
949 		return skb_header_pointer(skb, offset + iph->ihl * 4,
950 					  sizeof(struct ipv6hdr), buf);
951 
952 	else if (iph->version == 4)
953 		return iph;
954 
955 	else if (iph->version == 6)
956 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
957 					  buf);
958 
959 	return NULL;
960 }
961 
962 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
963 				      void *buf, unsigned int bufsize)
964 {
965 	unsigned int offset = skb_network_offset(skb);
966 	const struct ipv6hdr *ipv6h;
967 	const struct tcphdr *tcph;
968 	const struct iphdr *iph;
969 	struct ipv6hdr _ipv6h;
970 	struct tcphdr _tcph;
971 
972 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
973 
974 	if (!ipv6h)
975 		return NULL;
976 
977 	if (ipv6h->version == 4) {
978 		iph = (struct iphdr *)ipv6h;
979 		offset += iph->ihl * 4;
980 
981 		/* special-case 6in4 tunnelling, as that is a common way to get
982 		 * v6 connectivity in the home
983 		 */
984 		if (iph->protocol == IPPROTO_IPV6) {
985 			ipv6h = skb_header_pointer(skb, offset,
986 						   sizeof(_ipv6h), &_ipv6h);
987 
988 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
989 				return NULL;
990 
991 			offset += sizeof(struct ipv6hdr);
992 
993 		} else if (iph->protocol != IPPROTO_TCP) {
994 			return NULL;
995 		}
996 
997 	} else if (ipv6h->version == 6) {
998 		if (ipv6h->nexthdr != IPPROTO_TCP)
999 			return NULL;
1000 
1001 		offset += sizeof(struct ipv6hdr);
1002 	} else {
1003 		return NULL;
1004 	}
1005 
1006 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
1007 	if (!tcph || tcph->doff < 5)
1008 		return NULL;
1009 
1010 	return skb_header_pointer(skb, offset,
1011 				  min(__tcp_hdrlen(tcph), bufsize), buf);
1012 }
1013 
1014 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1015 				   int code, int *oplen)
1016 {
1017 	/* inspired by tcp_parse_options in tcp_input.c */
1018 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1019 	const u8 *ptr = (const u8 *)(tcph + 1);
1020 
1021 	while (length > 0) {
1022 		int opcode = *ptr++;
1023 		int opsize;
1024 
1025 		if (opcode == TCPOPT_EOL)
1026 			break;
1027 		if (opcode == TCPOPT_NOP) {
1028 			length--;
1029 			continue;
1030 		}
1031 		if (length < 2)
1032 			break;
1033 		opsize = *ptr++;
1034 		if (opsize < 2 || opsize > length)
1035 			break;
1036 
1037 		if (opcode == code) {
1038 			*oplen = opsize;
1039 			return ptr;
1040 		}
1041 
1042 		ptr += opsize - 2;
1043 		length -= opsize;
1044 	}
1045 
1046 	return NULL;
1047 }
1048 
1049 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1050  * bytes than the other. In the case where both sequences ACK bytes that the
1051  * other doesn't, A is considered greater. DSACKs in A also make A be
1052  * considered greater.
1053  *
1054  * @return -1, 0 or 1 as normal compare functions
1055  */
1056 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1057 				  const struct tcphdr *tcph_b)
1058 {
1059 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1060 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1061 	u32 bytes_a = 0, bytes_b = 0;
1062 	int oplen_a, oplen_b;
1063 	bool first = true;
1064 
1065 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1066 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1067 
1068 	/* pointers point to option contents */
1069 	oplen_a -= TCPOLEN_SACK_BASE;
1070 	oplen_b -= TCPOLEN_SACK_BASE;
1071 
1072 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1073 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1074 		return -1;
1075 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1076 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1077 		return 1;
1078 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1079 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1080 		return 0;
1081 
1082 	while (oplen_a >= sizeof(*sack_a)) {
1083 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1084 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1085 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1086 		int oplen_tmp = oplen_b;
1087 		bool found = false;
1088 
1089 		/* DSACK; always considered greater to prevent dropping */
1090 		if (before(start_a, ack_seq_a))
1091 			return -1;
1092 
1093 		bytes_a += end_a - start_a;
1094 
1095 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1096 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1097 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1098 
1099 			/* first time through we count the total size */
1100 			if (first)
1101 				bytes_b += end_b - start_b;
1102 
1103 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1104 				found = true;
1105 				if (!first)
1106 					break;
1107 			}
1108 			oplen_tmp -= sizeof(*sack_tmp);
1109 			sack_tmp++;
1110 		}
1111 
1112 		if (!found)
1113 			return -1;
1114 
1115 		oplen_a -= sizeof(*sack_a);
1116 		sack_a++;
1117 		first = false;
1118 	}
1119 
1120 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1121 	 * either the SACKs are equal, or B SACKs more bytes.
1122 	 */
1123 	return bytes_b > bytes_a ? 1 : 0;
1124 }
1125 
1126 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1127 				 u32 *tsval, u32 *tsecr)
1128 {
1129 	const u8 *ptr;
1130 	int opsize;
1131 
1132 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1133 
1134 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1135 		*tsval = get_unaligned_be32(ptr);
1136 		*tsecr = get_unaligned_be32(ptr + 4);
1137 	}
1138 }
1139 
1140 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1141 			       u32 tstamp_new, u32 tsecr_new)
1142 {
1143 	/* inspired by tcp_parse_options in tcp_input.c */
1144 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1145 	const u8 *ptr = (const u8 *)(tcph + 1);
1146 	u32 tstamp, tsecr;
1147 
1148 	/* 3 reserved flags must be unset to avoid future breakage
1149 	 * ACK must be set
1150 	 * ECE/CWR are handled separately
1151 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1152 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1153 	 * 0x00C00000 = CWR/ECE (handled separately)
1154 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1155 	 */
1156 	if (((tcp_flag_word(tcph) &
1157 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1158 		return false;
1159 
1160 	while (length > 0) {
1161 		int opcode = *ptr++;
1162 		int opsize;
1163 
1164 		if (opcode == TCPOPT_EOL)
1165 			break;
1166 		if (opcode == TCPOPT_NOP) {
1167 			length--;
1168 			continue;
1169 		}
1170 		if (length < 2)
1171 			break;
1172 		opsize = *ptr++;
1173 		if (opsize < 2 || opsize > length)
1174 			break;
1175 
1176 		switch (opcode) {
1177 		case TCPOPT_MD5SIG: /* doesn't influence state */
1178 			break;
1179 
1180 		case TCPOPT_SACK: /* stricter checking performed later */
1181 			if (opsize % 8 != 2)
1182 				return false;
1183 			break;
1184 
1185 		case TCPOPT_TIMESTAMP:
1186 			/* only drop timestamps lower than new */
1187 			if (opsize != TCPOLEN_TIMESTAMP)
1188 				return false;
1189 			tstamp = get_unaligned_be32(ptr);
1190 			tsecr = get_unaligned_be32(ptr + 4);
1191 			if (after(tstamp, tstamp_new) ||
1192 			    after(tsecr, tsecr_new))
1193 				return false;
1194 			break;
1195 
1196 		case TCPOPT_MSS:  /* these should only be set on SYN */
1197 		case TCPOPT_WINDOW:
1198 		case TCPOPT_SACK_PERM:
1199 		case TCPOPT_FASTOPEN:
1200 		case TCPOPT_EXP:
1201 		default: /* don't drop if any unknown options are present */
1202 			return false;
1203 		}
1204 
1205 		ptr += opsize - 2;
1206 		length -= opsize;
1207 	}
1208 
1209 	return true;
1210 }
1211 
1212 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1213 				       struct cake_flow *flow)
1214 {
1215 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1216 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1217 	struct sk_buff *skb_check, *skb_prev = NULL;
1218 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1219 	unsigned char _tcph[64], _tcph_check[64];
1220 	const struct tcphdr *tcph, *tcph_check;
1221 	const struct iphdr *iph, *iph_check;
1222 	struct ipv6hdr _iph, _iph_check;
1223 	const struct sk_buff *skb;
1224 	int seglen, num_found = 0;
1225 	u32 tstamp = 0, tsecr = 0;
1226 	__be32 elig_flags = 0;
1227 	int sack_comp;
1228 
1229 	/* no other possible ACKs to filter */
1230 	if (flow->head == flow->tail)
1231 		return NULL;
1232 
1233 	skb = flow->tail;
1234 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1235 	iph = cake_get_iphdr(skb, &_iph);
1236 	if (!tcph)
1237 		return NULL;
1238 
1239 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1240 
1241 	/* the 'triggering' packet need only have the ACK flag set.
1242 	 * also check that SYN is not set, as there won't be any previous ACKs.
1243 	 */
1244 	if ((tcp_flag_word(tcph) &
1245 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1246 		return NULL;
1247 
1248 	/* the 'triggering' ACK is at the tail of the queue, we have already
1249 	 * returned if it is the only packet in the flow. loop through the rest
1250 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1251 	 * triggering one.
1252 	 */
1253 	for (skb_check = flow->head;
1254 	     skb_check && skb_check != skb;
1255 	     skb_prev = skb_check, skb_check = skb_check->next) {
1256 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1257 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1258 					     sizeof(_tcph_check));
1259 
1260 		/* only TCP packets with matching 5-tuple are eligible, and only
1261 		 * drop safe headers
1262 		 */
1263 		if (!tcph_check || iph->version != iph_check->version ||
1264 		    tcph_check->source != tcph->source ||
1265 		    tcph_check->dest != tcph->dest)
1266 			continue;
1267 
1268 		if (iph_check->version == 4) {
1269 			if (iph_check->saddr != iph->saddr ||
1270 			    iph_check->daddr != iph->daddr)
1271 				continue;
1272 
1273 			seglen = iph_totlen(skb, iph_check) -
1274 				       (4 * iph_check->ihl);
1275 		} else if (iph_check->version == 6) {
1276 			ipv6h = (struct ipv6hdr *)iph;
1277 			ipv6h_check = (struct ipv6hdr *)iph_check;
1278 
1279 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1280 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1281 				continue;
1282 
1283 			seglen = ntohs(ipv6h_check->payload_len);
1284 		} else {
1285 			WARN_ON(1);  /* shouldn't happen */
1286 			continue;
1287 		}
1288 
1289 		/* If the ECE/CWR flags changed from the previous eligible
1290 		 * packet in the same flow, we should no longer be dropping that
1291 		 * previous packet as this would lose information.
1292 		 */
1293 		if (elig_ack && (tcp_flag_word(tcph_check) &
1294 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1295 			elig_ack = NULL;
1296 			elig_ack_prev = NULL;
1297 			num_found--;
1298 		}
1299 
1300 		/* Check TCP options and flags, don't drop ACKs with segment
1301 		 * data, and don't drop ACKs with a higher cumulative ACK
1302 		 * counter than the triggering packet. Check ACK seqno here to
1303 		 * avoid parsing SACK options of packets we are going to exclude
1304 		 * anyway.
1305 		 */
1306 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1307 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1308 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1309 			continue;
1310 
1311 		/* Check SACK options. The triggering packet must SACK more data
1312 		 * than the ACK under consideration, or SACK the same range but
1313 		 * have a larger cumulative ACK counter. The latter is a
1314 		 * pathological case, but is contained in the following check
1315 		 * anyway, just to be safe.
1316 		 */
1317 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1318 
1319 		if (sack_comp < 0 ||
1320 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1321 		     sack_comp == 0))
1322 			continue;
1323 
1324 		/* At this point we have found an eligible pure ACK to drop; if
1325 		 * we are in aggressive mode, we are done. Otherwise, keep
1326 		 * searching unless this is the second eligible ACK we
1327 		 * found.
1328 		 *
1329 		 * Since we want to drop ACK closest to the head of the queue,
1330 		 * save the first eligible ACK we find, even if we need to loop
1331 		 * again.
1332 		 */
1333 		if (!elig_ack) {
1334 			elig_ack = skb_check;
1335 			elig_ack_prev = skb_prev;
1336 			elig_flags = (tcp_flag_word(tcph_check)
1337 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1338 		}
1339 
1340 		if (num_found++ > 0)
1341 			goto found;
1342 	}
1343 
1344 	/* We made it through the queue without finding two eligible ACKs. If
1345 	 * we found a single eligible ACK we can drop it in aggressive mode if
1346 	 * we can guarantee that this does not interfere with ECN flag
1347 	 * information. We ensure this by dropping it only if the enqueued
1348 	 * packet is consecutive with the eligible ACK, and their flags match.
1349 	 */
1350 	if (elig_ack && aggressive && elig_ack->next == skb &&
1351 	    (elig_flags == (tcp_flag_word(tcph) &
1352 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1353 		goto found;
1354 
1355 	return NULL;
1356 
1357 found:
1358 	if (elig_ack_prev)
1359 		elig_ack_prev->next = elig_ack->next;
1360 	else
1361 		flow->head = elig_ack->next;
1362 
1363 	skb_mark_not_on_list(elig_ack);
1364 
1365 	return elig_ack;
1366 }
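/* The filter above runs only when ack_filter is enabled (with the iproute2
 * front end, the "ack-filter" and "ack-filter-aggressive" keywords).  In
 * conservative mode at least two droppable ACKs must already be queued
 * before one is removed, so the newest ACK always survives; aggressive mode
 * will also drop a single older ACK when that provably cannot lose ECE/CWR
 * information.
 */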
1367 
1368 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1369 {
1370 	avg -= avg >> shift;
1371 	avg += sample >> shift;
1372 	return avg;
1373 }
1374 
1375 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1376 {
1377 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1378 		len -= off;
1379 
1380 	if (q->max_netlen < len)
1381 		q->max_netlen = len;
1382 	if (q->min_netlen > len)
1383 		q->min_netlen = len;
1384 
1385 	len += q->rate_overhead;
1386 
1387 	if (len < q->rate_mpu)
1388 		len = q->rate_mpu;
1389 
1390 	if (q->atm_mode == CAKE_ATM_ATM) {
1391 		len += 47;
1392 		len /= 48;
1393 		len *= 53;
1394 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1395 		/* Add one byte per 64 bytes or part thereof.
1396 		 * This is conservative and easier to calculate than the
1397 		 * precise value.
1398 		 */
1399 		len += (len + 63) / 64;
1400 	}
1401 
1402 	if (q->max_adjlen < len)
1403 		q->max_adjlen = len;
1404 	if (q->min_adjlen > len)
1405 		q->min_adjlen = len;
1406 
1407 	return len;
1408 }
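/* Worked example of the framing compensation above (illustrative sizes):
 * for len = 1522 bytes after overhead, ATM mode charges
 * ((1522 + 47) / 48) * 53 = 32 * 53 = 1696 bytes, i.e. a whole number of
 * 53-byte cells including cell headers and final-cell padding, while PTM
 * mode adds one byte per started 64-byte block:
 * 1522 + (1522 + 63) / 64 = 1522 + 24 = 1546 bytes, approximating 64b/65b
 * encoding.
 */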
1409 
1410 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1411 {
1412 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1413 	unsigned int hdr_len, last_len = 0;
1414 	u32 off = skb_network_offset(skb);
1415 	u32 len = qdisc_pkt_len(skb);
1416 	u16 segs = 1;
1417 
1418 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1419 
1420 	if (!shinfo->gso_size)
1421 		return cake_calc_overhead(q, len, off);
1422 
1423 	/* borrowed from qdisc_pkt_len_init() */
1424 	hdr_len = skb_transport_offset(skb);
1425 
1426 	/* + transport layer */
1427 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1428 						SKB_GSO_TCPV6))) {
1429 		const struct tcphdr *th;
1430 		struct tcphdr _tcphdr;
1431 
1432 		th = skb_header_pointer(skb, hdr_len,
1433 					sizeof(_tcphdr), &_tcphdr);
1434 		if (likely(th))
1435 			hdr_len += __tcp_hdrlen(th);
1436 	} else {
1437 		struct udphdr _udphdr;
1438 
1439 		if (skb_header_pointer(skb, hdr_len,
1440 				       sizeof(_udphdr), &_udphdr))
1441 			hdr_len += sizeof(struct udphdr);
1442 	}
1443 
1444 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1445 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1446 				    shinfo->gso_size);
1447 	else
1448 		segs = shinfo->gso_segs;
1449 
1450 	len = shinfo->gso_size + hdr_len;
1451 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1452 
1453 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1454 		cake_calc_overhead(q, last_len, off));
1455 }
1456 
1457 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1458 {
1459 	struct cake_heap_entry ii = q->overflow_heap[i];
1460 	struct cake_heap_entry jj = q->overflow_heap[j];
1461 
1462 	q->overflow_heap[i] = jj;
1463 	q->overflow_heap[j] = ii;
1464 
1465 	q->tins[ii.t].overflow_idx[ii.b] = j;
1466 	q->tins[jj.t].overflow_idx[jj.b] = i;
1467 }
1468 
1469 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1470 {
1471 	struct cake_heap_entry ii = q->overflow_heap[i];
1472 
1473 	return q->tins[ii.t].backlogs[ii.b];
1474 }
1475 
1476 static void cake_heapify(struct cake_sched_data *q, u16 i)
1477 {
1478 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1479 	u32 mb = cake_heap_get_backlog(q, i);
1480 	u32 m = i;
1481 
1482 	while (m < a) {
1483 		u32 l = m + m + 1;
1484 		u32 r = l + 1;
1485 
1486 		if (l < a) {
1487 			u32 lb = cake_heap_get_backlog(q, l);
1488 
1489 			if (lb > mb) {
1490 				m  = l;
1491 				mb = lb;
1492 			}
1493 		}
1494 
1495 		if (r < a) {
1496 			u32 rb = cake_heap_get_backlog(q, r);
1497 
1498 			if (rb > mb) {
1499 				m  = r;
1500 				mb = rb;
1501 			}
1502 		}
1503 
1504 		if (m != i) {
1505 			cake_heap_swap(q, i, m);
1506 			i = m;
1507 		} else {
1508 			break;
1509 		}
1510 	}
1511 }
1512 
1513 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1514 {
1515 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1516 		u16 p = (i - 1) >> 1;
1517 		u32 ib = cake_heap_get_backlog(q, i);
1518 		u32 pb = cake_heap_get_backlog(q, p);
1519 
1520 		if (ib > pb) {
1521 			cake_heap_swap(q, i, p);
1522 			i = p;
1523 		} else {
1524 			break;
1525 		}
1526 	}
1527 }
1528 
1529 static int cake_advance_shaper(struct cake_sched_data *q,
1530 			       struct cake_tin_data *b,
1531 			       struct sk_buff *skb,
1532 			       ktime_t now, bool drop)
1533 {
1534 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1535 
1536 	/* charge packet bandwidth to this tin
1537 	 * and to the global shaper.
1538 	 */
1539 	if (q->rate_ns) {
1540 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1541 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1542 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1543 
1544 		if (ktime_before(b->time_next_packet, now))
1545 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1546 							   tin_dur);
1547 
1548 		else if (ktime_before(b->time_next_packet,
1549 				      ktime_add_ns(now, tin_dur)))
1550 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1551 
1552 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1553 						   global_dur);
1554 		if (!drop)
1555 			q->failsafe_next_packet = \
1556 				ktime_add_ns(q->failsafe_next_packet,
1557 					     failsafe_dur);
1558 	}
1559 	return len;
1560 }
1561 
1562 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1563 {
1564 	struct cake_sched_data *q = qdisc_priv(sch);
1565 	ktime_t now = ktime_get();
1566 	u32 idx = 0, tin = 0, len;
1567 	struct cake_heap_entry qq;
1568 	struct cake_tin_data *b;
1569 	struct cake_flow *flow;
1570 	struct sk_buff *skb;
1571 
1572 	if (!q->overflow_timeout) {
1573 		int i;
1574 		/* Build fresh max-heap */
1575 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1576 			cake_heapify(q, i);
1577 	}
1578 	q->overflow_timeout = 65535;
1579 
1580 	/* select longest queue for pruning */
1581 	qq  = q->overflow_heap[0];
1582 	tin = qq.t;
1583 	idx = qq.b;
1584 
1585 	b = &q->tins[tin];
1586 	flow = &b->flows[idx];
1587 	skb = dequeue_head(flow);
1588 	if (unlikely(!skb)) {
1589 		/* heap has gone wrong, rebuild it next time */
1590 		q->overflow_timeout = 0;
1591 		return idx + (tin << 16);
1592 	}
1593 
1594 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1595 		b->unresponsive_flow_count++;
1596 
1597 	len = qdisc_pkt_len(skb);
1598 	q->buffer_used      -= skb->truesize;
1599 	b->backlogs[idx]    -= len;
1600 	b->tin_backlog      -= len;
1601 	sch->qstats.backlog -= len;
1602 
1603 	flow->dropped++;
1604 	b->tin_dropped++;
1605 	sch->qstats.drops++;
1606 
1607 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1608 		cake_advance_shaper(q, b, skb, now, true);
1609 
1610 	__qdisc_drop(skb, to_free);
1611 	sch->q.qlen--;
1612 	qdisc_tree_reduce_backlog(sch, 1, len);
1613 
1614 	cake_heapify(q, 0);
1615 
1616 	return idx + (tin << 16);
1617 }
1618 
1619 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1620 {
1621 	const int offset = skb_network_offset(skb);
1622 	u16 *buf, buf_;
1623 	u8 dscp;
1624 
1625 	switch (skb_protocol(skb, true)) {
1626 	case htons(ETH_P_IP):
1627 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1628 		if (unlikely(!buf))
1629 			return 0;
1630 
1631 		/* ToS is in the second byte of iphdr */
1632 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1633 
1634 		if (wash && dscp) {
1635 			const int wlen = offset + sizeof(struct iphdr);
1636 
1637 			if (!pskb_may_pull(skb, wlen) ||
1638 			    skb_try_make_writable(skb, wlen))
1639 				return 0;
1640 
1641 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1642 		}
1643 
1644 		return dscp;
1645 
1646 	case htons(ETH_P_IPV6):
1647 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1648 		if (unlikely(!buf))
1649 			return 0;
1650 
1651 		/* Traffic class is in the first and second bytes of ipv6hdr */
1652 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1653 
1654 		if (wash && dscp) {
1655 			const int wlen = offset + sizeof(struct ipv6hdr);
1656 
1657 			if (!pskb_may_pull(skb, wlen) ||
1658 			    skb_try_make_writable(skb, wlen))
1659 				return 0;
1660 
1661 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1662 		}
1663 
1664 		return dscp;
1665 
1666 	case htons(ETH_P_ARP):
1667 		return 0x38;  /* CS7 - Net Control */
1668 
1669 	default:
1670 		/* If there is no Diffserv field, treat as best-effort */
1671 		return 0;
1672 	}
1673 }
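/* Note on "wash": the DSCP bits are zeroed after being read, while the two
 * ECN bits (INET_ECN_MASK) are preserved, so downstream hops see best-effort
 * traffic even though cake itself still classified on the original
 * codepoint.  ARP is mapped to CS7 so that, in the diffserv modes, address
 * resolution is classified alongside network-control traffic.
 */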
1674 
1675 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1676 					     struct sk_buff *skb)
1677 {
1678 	struct cake_sched_data *q = qdisc_priv(sch);
1679 	u32 tin, mark;
1680 	bool wash;
1681 	u8 dscp;
1682 
1683 	/* Tin selection: Default to diffserv-based selection, allow overriding
1684 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1685 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1686 	 */
1687 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1688 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1689 	if (wash)
1690 		dscp = cake_handle_diffserv(skb, wash);
1691 
1692 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1693 		tin = 0;
1694 
1695 	else if (mark && mark <= q->tin_cnt)
1696 		tin = q->tin_order[mark - 1];
1697 
1698 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1699 		 TC_H_MIN(skb->priority) > 0 &&
1700 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1701 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1702 
1703 	else {
1704 		if (!wash)
1705 			dscp = cake_handle_diffserv(skb, wash);
1706 		tin = q->tin_index[dscp];
1707 
1708 		if (unlikely(tin >= q->tin_cnt))
1709 			tin = 0;
1710 	}
1711 
1712 	return &q->tins[tin];
1713 }
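/* Tin selection precedence, as implemented above: a firewall mark (masked
 * and shifted according to the configured fwmark mask) beats skb->priority,
 * which in turn beats the DSCP lookup.  For example, assuming the iproute2
 * keywords, with "cake diffserv3 fwmark 0xff" a netfilter rule that sets
 * mark 2 forces the packet into the second tin in priority order regardless
 * of its DSCP; in besteffort mode all of this collapses to the single tin.
 */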
1714 
1715 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1716 			 struct sk_buff *skb, int flow_mode, int *qerr)
1717 {
1718 	struct cake_sched_data *q = qdisc_priv(sch);
1719 	struct tcf_proto *filter;
1720 	struct tcf_result res;
1721 	u16 flow = 0, host = 0;
1722 	int result;
1723 
1724 	filter = rcu_dereference_bh(q->filter_list);
1725 	if (!filter)
1726 		goto hash;
1727 
1728 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1729 	result = tcf_classify(skb, NULL, filter, &res, false);
1730 
1731 	if (result >= 0) {
1732 #ifdef CONFIG_NET_CLS_ACT
1733 		switch (result) {
1734 		case TC_ACT_STOLEN:
1735 		case TC_ACT_QUEUED:
1736 		case TC_ACT_TRAP:
1737 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1738 			fallthrough;
1739 		case TC_ACT_SHOT:
1740 			return 0;
1741 		}
1742 #endif
1743 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1744 			flow = TC_H_MIN(res.classid);
1745 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1746 			host = TC_H_MAJ(res.classid) >> 16;
1747 	}
1748 hash:
1749 	*t = cake_select_tin(sch, skb);
1750 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1751 }
1752 
1753 static void cake_reconfigure(struct Qdisc *sch);
1754 
1755 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1756 			struct sk_buff **to_free)
1757 {
1758 	struct cake_sched_data *q = qdisc_priv(sch);
1759 	int len = qdisc_pkt_len(skb);
1760 	int ret;
1761 	struct sk_buff *ack = NULL;
1762 	ktime_t now = ktime_get();
1763 	struct cake_tin_data *b;
1764 	struct cake_flow *flow;
1765 	u32 idx;
1766 
1767 	/* choose flow to insert into */
1768 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1769 	if (idx == 0) {
1770 		if (ret & __NET_XMIT_BYPASS)
1771 			qdisc_qstats_drop(sch);
1772 		__qdisc_drop(skb, to_free);
1773 		return ret;
1774 	}
1775 	idx--;
1776 	flow = &b->flows[idx];
1777 
1778 	/* ensure shaper state isn't stale */
1779 	if (!b->tin_backlog) {
1780 		if (ktime_before(b->time_next_packet, now))
1781 			b->time_next_packet = now;
1782 
1783 		if (!sch->q.qlen) {
1784 			if (ktime_before(q->time_next_packet, now)) {
1785 				q->failsafe_next_packet = now;
1786 				q->time_next_packet = now;
1787 			} else if (ktime_after(q->time_next_packet, now) &&
1788 				   ktime_after(q->failsafe_next_packet, now)) {
1789 				u64 next = \
1790 					min(ktime_to_ns(q->time_next_packet),
1791 					    ktime_to_ns(
1792 						   q->failsafe_next_packet));
1793 				sch->qstats.overlimits++;
1794 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1795 			}
1796 		}
1797 	}
1798 
1799 	if (unlikely(len > b->max_skblen))
1800 		b->max_skblen = len;
1801 
1802 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1803 		struct sk_buff *segs, *nskb;
1804 		netdev_features_t features = netif_skb_features(skb);
1805 		unsigned int slen = 0, numsegs = 0;
1806 
1807 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1808 		if (IS_ERR_OR_NULL(segs))
1809 			return qdisc_drop(skb, sch, to_free);
1810 
1811 		skb_list_walk_safe(segs, segs, nskb) {
1812 			skb_mark_not_on_list(segs);
1813 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1814 			cobalt_set_enqueue_time(segs, now);
1815 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1816 									  segs);
1817 			flow_queue_add(flow, segs);
1818 
1819 			sch->q.qlen++;
1820 			numsegs++;
1821 			slen += segs->len;
1822 			q->buffer_used += segs->truesize;
1823 			b->packets++;
1824 		}
1825 
1826 		/* stats */
1827 		b->bytes	    += slen;
1828 		b->backlogs[idx]    += slen;
1829 		b->tin_backlog      += slen;
1830 		sch->qstats.backlog += slen;
1831 		q->avg_window_bytes += slen;
1832 
1833 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1834 		consume_skb(skb);
1835 	} else {
1836 		/* not splitting */
1837 		cobalt_set_enqueue_time(skb, now);
1838 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1839 		flow_queue_add(flow, skb);
1840 
1841 		if (q->ack_filter)
1842 			ack = cake_ack_filter(q, flow);
1843 
1844 		if (ack) {
1845 			b->ack_drops++;
1846 			sch->qstats.drops++;
1847 			b->bytes += qdisc_pkt_len(ack);
1848 			len -= qdisc_pkt_len(ack);
1849 			q->buffer_used += skb->truesize - ack->truesize;
1850 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1851 				cake_advance_shaper(q, b, ack, now, true);
1852 
1853 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1854 			consume_skb(ack);
1855 		} else {
1856 			sch->q.qlen++;
1857 			q->buffer_used      += skb->truesize;
1858 		}
1859 
1860 		/* stats */
1861 		b->packets++;
1862 		b->bytes	    += len;
1863 		b->backlogs[idx]    += len;
1864 		b->tin_backlog      += len;
1865 		sch->qstats.backlog += len;
1866 		q->avg_window_bytes += len;
1867 	}
1868 
1869 	if (q->overflow_timeout)
1870 		cake_heapify_up(q, b->overflow_idx[idx]);
1871 
1872 	/* incoming bandwidth capacity estimate */
1873 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1874 		u64 packet_interval = \
1875 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1876 
1877 		if (packet_interval > NSEC_PER_SEC)
1878 			packet_interval = NSEC_PER_SEC;
1879 
1880 		/* filter out short-term bursts, eg. wifi aggregation */
1881 		q->avg_packet_interval = \
1882 			cake_ewma(q->avg_packet_interval,
1883 				  packet_interval,
1884 				  (packet_interval > q->avg_packet_interval ?
1885 					  2 : 8));
1886 
1887 		q->last_packet_time = now;
1888 
1889 		if (packet_interval > q->avg_packet_interval) {
1890 			u64 window_interval = \
1891 				ktime_to_ns(ktime_sub(now,
1892 						      q->avg_window_begin));
1893 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1894 
1895 			b = div64_u64(b, window_interval);
1896 			q->avg_peak_bandwidth =
1897 				cake_ewma(q->avg_peak_bandwidth, b,
1898 					  b > q->avg_peak_bandwidth ? 2 : 8);
1899 			q->avg_window_bytes = 0;
1900 			q->avg_window_begin = now;
1901 
1902 			if (ktime_after(now,
1903 					ktime_add_ms(q->last_reconfig_time,
1904 						     250))) {
1905 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1906 				cake_reconfigure(sch);
1907 			}
1908 		}
1909 	} else {
1910 		q->avg_window_bytes = 0;
1911 		q->last_packet_time = now;
1912 	}
1913 
1914 	/* flowchain */
1915 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1916 		if (!flow->set) {
1917 			list_add_tail(&flow->flowchain, &b->new_flows);
1918 		} else {
1919 			b->decaying_flow_count--;
1920 			list_move_tail(&flow->flowchain, &b->new_flows);
1921 		}
1922 		flow->set = CAKE_SET_SPARSE;
1923 		b->sparse_flow_count++;
1924 
1925 		flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1926 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1927 		/* this flow was empty, accounted as a sparse flow, but actually
1928 		 * in the bulk rotation.
1929 		 */
1930 		flow->set = CAKE_SET_BULK;
1931 		b->sparse_flow_count--;
1932 		b->bulk_flow_count++;
1933 
1934 		cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1935 		cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1936 	}
1937 
1938 	if (q->buffer_used > q->buffer_max_used)
1939 		q->buffer_max_used = q->buffer_used;
1940 
1941 	if (q->buffer_used > q->buffer_limit) {
1942 		u32 dropped = 0;
1943 
1944 		while (q->buffer_used > q->buffer_limit) {
1945 			dropped++;
1946 			cake_drop(sch, to_free);
1947 		}
1948 		b->drop_overlimit += dropped;
1949 	}
1950 	return NET_XMIT_SUCCESS;
1951 }
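
/* Reading aid for the autorate-ingress estimator in cake_enqueue() above,
 * assuming cake_ewma() is the shift-based EWMA defined earlier in this
 * file (avg = avg - (avg >> shift) + (sample >> shift)): a shift of 2
 * weights the new sample by 1/4, a shift of 8 by 1/256, so each average
 * tracks increases quickly and decays slowly.  The window bandwidth is
 * only re-sampled when the current packet gap exceeds the averaged gap,
 * which keeps dense short bursts (e.g. wifi aggregates) from being read
 * as link capacity, and the shaper is then set to 15/16 of the averaged
 * peak at most once every 250 ms.
 */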
1952 
1953 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1954 {
1955 	struct cake_sched_data *q = qdisc_priv(sch);
1956 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1957 	struct cake_flow *flow = &b->flows[q->cur_flow];
1958 	struct sk_buff *skb = NULL;
1959 	u32 len;
1960 
1961 	if (flow->head) {
1962 		skb = dequeue_head(flow);
1963 		len = qdisc_pkt_len(skb);
1964 		b->backlogs[q->cur_flow] -= len;
1965 		b->tin_backlog		 -= len;
1966 		sch->qstats.backlog      -= len;
1967 		q->buffer_used		 -= skb->truesize;
1968 		sch->q.qlen--;
1969 
1970 		if (q->overflow_timeout)
1971 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1972 	}
1973 	return skb;
1974 }
1975 
1976 /* Discard leftover packets from a tin no longer in use. */
1977 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1978 {
1979 	struct cake_sched_data *q = qdisc_priv(sch);
1980 	struct sk_buff *skb;
1981 
1982 	q->cur_tin = tin;
1983 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1984 		while (!!(skb = cake_dequeue_one(sch)))
1985 			kfree_skb(skb);
1986 }
1987 
1988 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1989 {
1990 	struct cake_sched_data *q = qdisc_priv(sch);
1991 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1992 	ktime_t now = ktime_get();
1993 	struct cake_flow *flow;
1994 	struct list_head *head;
1995 	bool first_flow = true;
1996 	struct sk_buff *skb;
1997 	u64 delay;
1998 	u32 len;
1999 
2000 begin:
2001 	if (!sch->q.qlen)
2002 		return NULL;
2003 
2004 	/* global hard shaper */
2005 	if (ktime_after(q->time_next_packet, now) &&
2006 	    ktime_after(q->failsafe_next_packet, now)) {
2007 		u64 next = min(ktime_to_ns(q->time_next_packet),
2008 			       ktime_to_ns(q->failsafe_next_packet));
2009 
2010 		sch->qstats.overlimits++;
2011 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2012 		return NULL;
2013 	}
2014 
2015 	/* Choose a class to work on. */
2016 	if (!q->rate_ns) {
2017 		/* In unlimited mode, can't rely on shaper timings, just balance
2018 		 * with DRR
2019 		 */
2020 		bool wrapped = false, empty = true;
2021 
2022 		while (b->tin_deficit < 0 ||
2023 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2024 			if (b->tin_deficit <= 0)
2025 				b->tin_deficit += b->tin_quantum;
2026 			if (b->sparse_flow_count + b->bulk_flow_count)
2027 				empty = false;
2028 
2029 			q->cur_tin++;
2030 			b++;
2031 			if (q->cur_tin >= q->tin_cnt) {
2032 				q->cur_tin = 0;
2033 				b = q->tins;
2034 
2035 				if (wrapped) {
2036 					/* It's possible for q->qlen to be
2037 					 * nonzero when we actually have no
2038 					 * packets anywhere.
2039 					 */
2040 					if (empty)
2041 						return NULL;
2042 				} else {
2043 					wrapped = true;
2044 				}
2045 			}
2046 		}
2047 	} else {
2048 		/* In shaped mode, choose:
2049 		 * - Highest-priority tin with queue and meeting schedule, or
2050 		 * - The earliest-scheduled tin with queue.
2051 		 */
2052 		ktime_t best_time = KTIME_MAX;
2053 		int tin, best_tin = 0;
2054 
2055 		for (tin = 0; tin < q->tin_cnt; tin++) {
2056 			b = q->tins + tin;
2057 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2058 				ktime_t time_to_pkt = \
2059 					ktime_sub(b->time_next_packet, now);
2060 
2061 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2062 				    ktime_compare(time_to_pkt,
2063 						  best_time) <= 0) {
2064 					best_time = time_to_pkt;
2065 					best_tin = tin;
2066 				}
2067 			}
2068 		}
2069 
2070 		q->cur_tin = best_tin;
2071 		b = q->tins + best_tin;
2072 
2073 		/* No point in going further if no packets to deliver. */
2074 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2075 			return NULL;
2076 	}
2077 
2078 retry:
2079 	/* service this class */
2080 	head = &b->decaying_flows;
2081 	if (!first_flow || list_empty(head)) {
2082 		head = &b->new_flows;
2083 		if (list_empty(head)) {
2084 			head = &b->old_flows;
2085 			if (unlikely(list_empty(head))) {
2086 				head = &b->decaying_flows;
2087 				if (unlikely(list_empty(head)))
2088 					goto begin;
2089 			}
2090 		}
2091 	}
2092 	flow = list_first_entry(head, struct cake_flow, flowchain);
2093 	q->cur_flow = flow - b->flows;
2094 	first_flow = false;
2095 
2096 	/* flow isolation (DRR++) */
2097 	if (flow->deficit <= 0) {
2098 		/* Keep all flows with deficits out of the sparse and decaying
2099 		 * rotations.  No non-empty flow can go into the decaying
2100 		 * rotation, so they can't get deficits
2101 		 */
2102 		if (flow->set == CAKE_SET_SPARSE) {
2103 			if (flow->head) {
2104 				b->sparse_flow_count--;
2105 				b->bulk_flow_count++;
2106 
2107 				cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2108 				cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2109 
2110 				flow->set = CAKE_SET_BULK;
2111 			} else {
2112 				/* we've moved it to the bulk rotation for
2113 				 * correct deficit accounting but we still want
2114 				 * to count it as a sparse flow, not a bulk one.
2115 				 */
2116 				flow->set = CAKE_SET_SPARSE_WAIT;
2117 			}
2118 		}
2119 
2120 		flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2121 		list_move_tail(&flow->flowchain, &b->old_flows);
2122 
2123 		goto retry;
2124 	}
2125 
2126 	/* Retrieve a packet via the AQM */
2127 	while (1) {
2128 		skb = cake_dequeue_one(sch);
2129 		if (!skb) {
2130 			/* this queue was actually empty */
2131 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2132 				b->unresponsive_flow_count--;
2133 
2134 			if (flow->cvars.p_drop || flow->cvars.count ||
2135 			    ktime_before(now, flow->cvars.drop_next)) {
2136 				/* keep in the flowchain until the state has
2137 				 * decayed to rest
2138 				 */
2139 				list_move_tail(&flow->flowchain,
2140 					       &b->decaying_flows);
2141 				if (flow->set == CAKE_SET_BULK) {
2142 					b->bulk_flow_count--;
2143 
2144 					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2145 					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2146 
2147 					b->decaying_flow_count++;
2148 				} else if (flow->set == CAKE_SET_SPARSE ||
2149 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2150 					b->sparse_flow_count--;
2151 					b->decaying_flow_count++;
2152 				}
2153 				flow->set = CAKE_SET_DECAYING;
2154 			} else {
2155 				/* remove empty queue from the flowchain */
2156 				list_del_init(&flow->flowchain);
2157 				if (flow->set == CAKE_SET_SPARSE ||
2158 				    flow->set == CAKE_SET_SPARSE_WAIT)
2159 					b->sparse_flow_count--;
2160 				else if (flow->set == CAKE_SET_BULK) {
2161 					b->bulk_flow_count--;
2162 
2163 					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2164 					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2165 				} else
2166 					b->decaying_flow_count--;
2167 
2168 				flow->set = CAKE_SET_NONE;
2169 			}
2170 			goto begin;
2171 		}
2172 
2173 		/* Last packet in queue may be marked, shouldn't be dropped */
2174 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2175 					(b->bulk_flow_count *
2176 					 !!(q->rate_flags &
2177 					    CAKE_FLAG_INGRESS))) ||
2178 		    !flow->head)
2179 			break;
2180 
2181 		/* drop this packet, get another one */
2182 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2183 			len = cake_advance_shaper(q, b, skb,
2184 						  now, true);
2185 			flow->deficit -= len;
2186 			b->tin_deficit -= len;
2187 		}
2188 		flow->dropped++;
2189 		b->tin_dropped++;
2190 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2191 		qdisc_qstats_drop(sch);
2192 		kfree_skb(skb);
2193 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2194 			goto retry;
2195 	}
2196 
2197 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2198 	qdisc_bstats_update(sch, skb);
2199 
2200 	/* collect delay stats */
2201 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2202 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2203 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2204 				  delay > b->peak_delay ? 2 : 8);
2205 	b->base_delay = cake_ewma(b->base_delay, delay,
2206 				  delay < b->base_delay ? 2 : 8);
2207 
2208 	len = cake_advance_shaper(q, b, skb, now, false);
2209 	flow->deficit -= len;
2210 	b->tin_deficit -= len;
2211 
2212 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2213 		u64 next = min(ktime_to_ns(q->time_next_packet),
2214 			       ktime_to_ns(q->failsafe_next_packet));
2215 
2216 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2217 	} else if (!sch->q.qlen) {
2218 		int i;
2219 
2220 		for (i = 0; i < q->tin_cnt; i++) {
2221 			if (q->tins[i].decaying_flow_count) {
2222 				ktime_t next = \
2223 					ktime_add_ns(now,
2224 						     q->tins[i].cparams.target);
2225 
2226 				qdisc_watchdog_schedule_ns(&q->watchdog,
2227 							   ktime_to_ns(next));
2228 				break;
2229 			}
2230 		}
2231 	}
2232 
2233 	if (q->overflow_timeout)
2234 		q->overflow_timeout--;
2235 
2236 	return skb;
2237 }
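
/* Informal summary of the flow rotation implemented by cake_dequeue()
 * above (a restatement, not additional mechanism): sparse (new) flows are
 * served ahead of bulk (old) flows in DRR++ fashion; a flow that runs out
 * of deficit gets one quantum added and moves to the tail of the old list;
 * and a flow that empties is parked on the decaying list while its
 * Codel/BLUE state relaxes, so a flow cannot shed its AQM history simply
 * by going idle for a moment.
 */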
2238 
2239 static void cake_reset(struct Qdisc *sch)
2240 {
2241 	struct cake_sched_data *q = qdisc_priv(sch);
2242 	u32 c;
2243 
2244 	if (!q->tins)
2245 		return;
2246 
2247 	for (c = 0; c < CAKE_MAX_TINS; c++)
2248 		cake_clear_tin(sch, c);
2249 }
2250 
2251 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2252 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2253 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2254 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2255 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2256 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2257 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2258 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2259 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2260 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2261 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2262 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2263 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2264 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2265 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2266 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2267 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2268 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2269 };
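
/* Example of how these attributes are typically set from userspace (the
 * command is an illustration of the iproute2 front end, not something this
 * file depends on):
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 40Mbit diffserv4 \
 *		nat wash ack-filter
 *
 * Each keyword maps onto one of the TCA_CAKE_* attributes above
 * (TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE, TCA_CAKE_NAT,
 * TCA_CAKE_WASH and TCA_CAKE_ACK_FILTER respectively), which cake_change()
 * then parses under this policy table.
 */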
2270 
2271 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2272 			  u64 target_ns, u64 rtt_est_ns)
2273 {
2274 	/* convert byte-rate into time-per-byte, with a floor of MIN_RATE
2275 	 * so the shaper will always unwedge in reasonable time.
2276 	 */
2277 	static const u64 MIN_RATE = 64;
2278 	u32 byte_target = mtu;
2279 	u64 byte_target_ns;
2280 	u8  rate_shft = 0;
2281 	u64 rate_ns = 0;
2282 
2283 	b->flow_quantum = 1514;
2284 	if (rate) {
2285 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2286 		rate_shft = 34;
2287 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2288 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2289 		while (!!(rate_ns >> 34)) {
2290 			rate_ns >>= 1;
2291 			rate_shft--;
2292 		}
2293 	} /* else unlimited, ie. zero delay */
2294 
2295 	b->tin_rate_bps  = rate;
2296 	b->tin_rate_ns   = rate_ns;
2297 	b->tin_rate_shft = rate_shft;
2298 
2299 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2300 
2301 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2302 	b->cparams.interval = max(rtt_est_ns +
2303 				     b->cparams.target - target_ns,
2304 				     b->cparams.target * 2);
2305 	b->cparams.mtu_time = byte_target_ns;
2306 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2307 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2308 }
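
/* Worked example for cake_set_rate() above (illustrative figures): for a
 * rate of 1,000,000 bytes/s the initial value
 * rate_ns = (NSEC_PER_SEC << 34) / rate is roughly 1.72e13, which the loop
 * halves ten times (rate_shft dropping from 34 to 24) until it fits below
 * 2^34.  The shaper's per-byte cost, (len * rate_ns) >> rate_shft, then
 * works out to ~1000 ns per byte, as expected for 1 Mbyte/s, while keeping
 * the multiplication comfortably inside 64 bits for any packet length.
 */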
2309 
2310 static int cake_config_besteffort(struct Qdisc *sch)
2311 {
2312 	struct cake_sched_data *q = qdisc_priv(sch);
2313 	struct cake_tin_data *b = &q->tins[0];
2314 	u32 mtu = psched_mtu(qdisc_dev(sch));
2315 	u64 rate = q->rate_bps;
2316 
2317 	q->tin_cnt = 1;
2318 
2319 	q->tin_index = besteffort;
2320 	q->tin_order = normal_order;
2321 
2322 	cake_set_rate(b, rate, mtu,
2323 		      us_to_ns(q->target), us_to_ns(q->interval));
2324 	b->tin_quantum = 65535;
2325 
2326 	return 0;
2327 }
2328 
2329 static int cake_config_precedence(struct Qdisc *sch)
2330 {
2331 	/* convert high-level (user visible) parameters into internal format */
2332 	struct cake_sched_data *q = qdisc_priv(sch);
2333 	u32 mtu = psched_mtu(qdisc_dev(sch));
2334 	u64 rate = q->rate_bps;
2335 	u32 quantum = 256;
2336 	u32 i;
2337 
2338 	q->tin_cnt = 8;
2339 	q->tin_index = precedence;
2340 	q->tin_order = normal_order;
2341 
2342 	for (i = 0; i < q->tin_cnt; i++) {
2343 		struct cake_tin_data *b = &q->tins[i];
2344 
2345 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2346 			      us_to_ns(q->interval));
2347 
2348 		b->tin_quantum = max_t(u16, 1U, quantum);
2349 
2350 		/* calculate next class's parameters */
2351 		rate  *= 7;
2352 		rate >>= 3;
2353 
2354 		quantum  *= 7;
2355 		quantum >>= 3;
2356 	}
2357 
2358 	return 0;
2359 }
2360 
2361 /*	List of known Diffserv codepoints:
2362  *
2363  *	Default Forwarding (DF/CS0) - Best Effort
2364  *	Max Throughput (TOS2)
2365  *	Min Delay (TOS4)
2366  *	LLT "La" (TOS5)
2367  *	Assured Forwarding 1 (AF1x) - x3
2368  *	Assured Forwarding 2 (AF2x) - x3
2369  *	Assured Forwarding 3 (AF3x) - x3
2370  *	Assured Forwarding 4 (AF4x) - x3
2371  *	Precedence Class 1 (CS1)
2372  *	Precedence Class 2 (CS2)
2373  *	Precedence Class 3 (CS3)
2374  *	Precedence Class 4 (CS4)
2375  *	Precedence Class 5 (CS5)
2376  *	Precedence Class 6 (CS6)
2377  *	Precedence Class 7 (CS7)
2378  *	Voice Admit (VA)
2379  *	Expedited Forwarding (EF)
2380  *	Lower Effort (LE)
2381  *
2382  *	Total 26 codepoints.
2383  */
2384 
2385 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2386  *		(roughly descending order of contended priority)
2387  *		(roughly ascending order of uncontended throughput)
2388  *
2389  *	Network Control (CS6,CS7)      - routing traffic
2390  *	Telephony (EF,VA)         - aka. VoIP streams
2391  *	Signalling (CS5)               - VoIP setup
2392  *	Multimedia Conferencing (AF4x) - aka. video calls
2393  *	Realtime Interactive (CS4)     - eg. games
2394  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2395  *	Broadcast Video (CS3)
2396  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2397  *	Ops, Admin, Management (CS2)      - eg. ssh
2398  *	Standard Service (DF & unrecognised codepoints)
2399  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2400  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2401  *
2402  *	Total 12 traffic classes.
2403  */
2404 
2405 static int cake_config_diffserv8(struct Qdisc *sch)
2406 {
2407 /*	Pruned list of traffic classes for typical applications:
2408  *
2409  *		Network Control          (CS6, CS7)
2410  *		Minimum Latency          (EF, VA, CS5, CS4)
2411  *		Interactive Shell        (CS2)
2412  *		Low Latency Transactions (AF2x, TOS4)
2413  *		Video Streaming          (AF4x, AF3x, CS3)
2414  *		Bog Standard             (DF etc.)
2415  *		High Throughput          (AF1x, TOS2, CS1)
2416  *		Background Traffic       (LE)
2417  *
2418  *		Total 8 traffic classes.
2419  */
2420 
2421 	struct cake_sched_data *q = qdisc_priv(sch);
2422 	u32 mtu = psched_mtu(qdisc_dev(sch));
2423 	u64 rate = q->rate_bps;
2424 	u32 quantum = 256;
2425 	u32 i;
2426 
2427 	q->tin_cnt = 8;
2428 
2429 	/* codepoint to class mapping */
2430 	q->tin_index = diffserv8;
2431 	q->tin_order = normal_order;
2432 
2433 	/* class characteristics */
2434 	for (i = 0; i < q->tin_cnt; i++) {
2435 		struct cake_tin_data *b = &q->tins[i];
2436 
2437 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2438 			      us_to_ns(q->interval));
2439 
2440 		b->tin_quantum = max_t(u16, 1U, quantum);
2441 
2442 		/* calculate next class's parameters */
2443 		rate  *= 7;
2444 		rate >>= 3;
2445 
2446 		quantum  *= 7;
2447 		quantum >>= 3;
2448 	}
2449 
2450 	return 0;
2451 }
2452 
2453 static int cake_config_diffserv4(struct Qdisc *sch)
2454 {
2455 /*  Further pruned list of traffic classes for four-class system:
2456  *
2457  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2458  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2459  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2460  *	    Background Traffic (LE, CS1)
2461  *
2462  *		Total 4 traffic classes.
2463  */
2464 
2465 	struct cake_sched_data *q = qdisc_priv(sch);
2466 	u32 mtu = psched_mtu(qdisc_dev(sch));
2467 	u64 rate = q->rate_bps;
2468 	u32 quantum = 1024;
2469 
2470 	q->tin_cnt = 4;
2471 
2472 	/* codepoint to class mapping */
2473 	q->tin_index = diffserv4;
2474 	q->tin_order = bulk_order;
2475 
2476 	/* class characteristics */
2477 	cake_set_rate(&q->tins[0], rate, mtu,
2478 		      us_to_ns(q->target), us_to_ns(q->interval));
2479 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2480 		      us_to_ns(q->target), us_to_ns(q->interval));
2481 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2482 		      us_to_ns(q->target), us_to_ns(q->interval));
2483 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2484 		      us_to_ns(q->target), us_to_ns(q->interval));
2485 
2486 	/* bandwidth-sharing weights */
2487 	q->tins[0].tin_quantum = quantum;
2488 	q->tins[1].tin_quantum = quantum >> 4;
2489 	q->tins[2].tin_quantum = quantum >> 1;
2490 	q->tins[3].tin_quantum = quantum >> 2;
2491 
2492 	return 0;
2493 }
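
/* Illustration of the split above (example figures, not extra policy):
 * reading the class list against tins[0..3], the bandwidth thresholds come
 * out at 100% of the base rate for Best Effort (tin 0), 1/16 for
 * Background (tin 1), 1/2 for Streaming Media (tin 2) and 1/4 for Latency
 * Sensitive (tin 3), and the quanta 1024/64/512/256 give the same
 * proportions as DRR weights once a tin exceeds its threshold.
 * q->tin_order (bulk_order here) defines the user-visible tin numbering
 * used by the fwmark/priority overrides and the stats dump, rather than
 * the internal tin index.
 */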
2494 
2495 static int cake_config_diffserv3(struct Qdisc *sch)
2496 {
2497 /*  Simplified Diffserv structure with 3 tins.
2498  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2499  *		Best Effort
2500  *		Low Priority		(LE, CS1)
2501  */
2502 	struct cake_sched_data *q = qdisc_priv(sch);
2503 	u32 mtu = psched_mtu(qdisc_dev(sch));
2504 	u64 rate = q->rate_bps;
2505 	u32 quantum = 1024;
2506 
2507 	q->tin_cnt = 3;
2508 
2509 	/* codepoint to class mapping */
2510 	q->tin_index = diffserv3;
2511 	q->tin_order = bulk_order;
2512 
2513 	/* class characteristics */
2514 	cake_set_rate(&q->tins[0], rate, mtu,
2515 		      us_to_ns(q->target), us_to_ns(q->interval));
2516 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2517 		      us_to_ns(q->target), us_to_ns(q->interval));
2518 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2519 		      us_to_ns(q->target), us_to_ns(q->interval));
2520 
2521 	/* bandwidth-sharing weights */
2522 	q->tins[0].tin_quantum = quantum;
2523 	q->tins[1].tin_quantum = quantum >> 4;
2524 	q->tins[2].tin_quantum = quantum >> 2;
2525 
2526 	return 0;
2527 }
2528 
2529 static void cake_reconfigure(struct Qdisc *sch)
2530 {
2531 	struct cake_sched_data *q = qdisc_priv(sch);
2532 	int c, ft;
2533 
2534 	switch (q->tin_mode) {
2535 	case CAKE_DIFFSERV_BESTEFFORT:
2536 		ft = cake_config_besteffort(sch);
2537 		break;
2538 
2539 	case CAKE_DIFFSERV_PRECEDENCE:
2540 		ft = cake_config_precedence(sch);
2541 		break;
2542 
2543 	case CAKE_DIFFSERV_DIFFSERV8:
2544 		ft = cake_config_diffserv8(sch);
2545 		break;
2546 
2547 	case CAKE_DIFFSERV_DIFFSERV4:
2548 		ft = cake_config_diffserv4(sch);
2549 		break;
2550 
2551 	case CAKE_DIFFSERV_DIFFSERV3:
2552 	default:
2553 		ft = cake_config_diffserv3(sch);
2554 		break;
2555 	}
2556 
2557 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2558 		cake_clear_tin(sch, c);
2559 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2560 	}
2561 
2562 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2563 	q->rate_shft = q->tins[ft].tin_rate_shft;
2564 
2565 	if (q->buffer_config_limit) {
2566 		q->buffer_limit = q->buffer_config_limit;
2567 	} else if (q->rate_bps) {
2568 		u64 t = q->rate_bps * q->interval;
2569 
2570 		do_div(t, USEC_PER_SEC / 4);
2571 		q->buffer_limit = max_t(u32, t, 4U << 20);
2572 	} else {
2573 		q->buffer_limit = ~0;
2574 	}
2575 
2576 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2577 
2578 	q->buffer_limit = min(q->buffer_limit,
2579 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2580 				  q->buffer_config_limit));
2581 }
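
/* Sizing example for the buffer limit computed above (an illustration):
 * with no explicit memory limit and a 12.5 Mbyte/s (100 Mbit/s) shaper at
 * the default 100 ms interval, t = rate_bps * interval / (USEC_PER_SEC / 4)
 * works out to four rate-delay products, i.e. 5 Mbytes, which exceeds the
 * 4 MiB floor and becomes buffer_limit.  In unlimited mode the limit
 * starts at ~0 and is only constrained by the final min() against
 * sch->limit packets of device MTU.
 */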
2582 
2583 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2584 		       struct netlink_ext_ack *extack)
2585 {
2586 	struct cake_sched_data *q = qdisc_priv(sch);
2587 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2588 	int err;
2589 
2590 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2591 					  extack);
2592 	if (err < 0)
2593 		return err;
2594 
2595 	if (tb[TCA_CAKE_NAT]) {
2596 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2597 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2598 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2599 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2600 #else
2601 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2602 				    "No conntrack support in kernel");
2603 		return -EOPNOTSUPP;
2604 #endif
2605 	}
2606 
2607 	if (tb[TCA_CAKE_BASE_RATE64])
2608 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2609 
2610 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2611 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2612 
2613 	if (tb[TCA_CAKE_WASH]) {
2614 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2615 			q->rate_flags |= CAKE_FLAG_WASH;
2616 		else
2617 			q->rate_flags &= ~CAKE_FLAG_WASH;
2618 	}
2619 
2620 	if (tb[TCA_CAKE_FLOW_MODE])
2621 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2622 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2623 					CAKE_FLOW_MASK));
2624 
2625 	if (tb[TCA_CAKE_ATM])
2626 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2627 
2628 	if (tb[TCA_CAKE_OVERHEAD]) {
2629 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2630 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2631 
2632 		q->max_netlen = 0;
2633 		q->max_adjlen = 0;
2634 		q->min_netlen = ~0;
2635 		q->min_adjlen = ~0;
2636 	}
2637 
2638 	if (tb[TCA_CAKE_RAW]) {
2639 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2640 
2641 		q->max_netlen = 0;
2642 		q->max_adjlen = 0;
2643 		q->min_netlen = ~0;
2644 		q->min_adjlen = ~0;
2645 	}
2646 
2647 	if (tb[TCA_CAKE_MPU])
2648 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2649 
2650 	if (tb[TCA_CAKE_RTT]) {
2651 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2652 
2653 		if (!q->interval)
2654 			q->interval = 1;
2655 	}
2656 
2657 	if (tb[TCA_CAKE_TARGET]) {
2658 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2659 
2660 		if (!q->target)
2661 			q->target = 1;
2662 	}
2663 
2664 	if (tb[TCA_CAKE_AUTORATE]) {
2665 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2666 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2667 		else
2668 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2669 	}
2670 
2671 	if (tb[TCA_CAKE_INGRESS]) {
2672 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2673 			q->rate_flags |= CAKE_FLAG_INGRESS;
2674 		else
2675 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2676 	}
2677 
2678 	if (tb[TCA_CAKE_ACK_FILTER])
2679 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2680 
2681 	if (tb[TCA_CAKE_MEMORY])
2682 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2683 
2684 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2685 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2686 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2687 		else
2688 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2689 	}
2690 
2691 	if (tb[TCA_CAKE_FWMARK]) {
2692 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2693 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2694 	}
2695 
2696 	if (q->tins) {
2697 		sch_tree_lock(sch);
2698 		cake_reconfigure(sch);
2699 		sch_tree_unlock(sch);
2700 	}
2701 
2702 	return 0;
2703 }
2704 
2705 static void cake_destroy(struct Qdisc *sch)
2706 {
2707 	struct cake_sched_data *q = qdisc_priv(sch);
2708 
2709 	qdisc_watchdog_cancel(&q->watchdog);
2710 	tcf_block_put(q->block);
2711 	kvfree(q->tins);
2712 }
2713 
2714 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2715 		     struct netlink_ext_ack *extack)
2716 {
2717 	struct cake_sched_data *q = qdisc_priv(sch);
2718 	int i, j, err;
2719 
2720 	sch->limit = 10240;
2721 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2722 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2723 
2724 	q->rate_bps = 0; /* unlimited by default */
2725 
2726 	q->interval = 100000; /* 100ms default */
2727 	q->target   =   5000; /* 5ms: codel RFC argues
2728 			       * for 5 to 10% of interval
2729 			       */
2730 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2731 	q->cur_tin = 0;
2732 	q->cur_flow  = 0;
2733 
2734 	qdisc_watchdog_init(&q->watchdog, sch);
2735 
2736 	if (opt) {
2737 		err = cake_change(sch, opt, extack);
2738 
2739 		if (err)
2740 			return err;
2741 	}
2742 
2743 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2744 	if (err)
2745 		return err;
2746 
2747 	quantum_div[0] = ~0;
2748 	for (i = 1; i <= CAKE_QUEUES; i++)
2749 		quantum_div[i] = 65535 / i;
2750 
2751 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2752 			   GFP_KERNEL);
2753 	if (!q->tins)
2754 		return -ENOMEM;
2755 
2756 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2757 		struct cake_tin_data *b = q->tins + i;
2758 
2759 		INIT_LIST_HEAD(&b->new_flows);
2760 		INIT_LIST_HEAD(&b->old_flows);
2761 		INIT_LIST_HEAD(&b->decaying_flows);
2762 		b->sparse_flow_count = 0;
2763 		b->bulk_flow_count = 0;
2764 		b->decaying_flow_count = 0;
2765 
2766 		for (j = 0; j < CAKE_QUEUES; j++) {
2767 			struct cake_flow *flow = b->flows + j;
2768 			u32 k = j * CAKE_MAX_TINS + i;
2769 
2770 			INIT_LIST_HEAD(&flow->flowchain);
2771 			cobalt_vars_init(&flow->cvars);
2772 
2773 			q->overflow_heap[k].t = i;
2774 			q->overflow_heap[k].b = j;
2775 			b->overflow_idx[j] = k;
2776 		}
2777 	}
2778 
2779 	cake_reconfigure(sch);
2780 	q->avg_peak_bandwidth = q->rate_bps;
2781 	q->min_netlen = ~0;
2782 	q->min_adjlen = ~0;
2783 	return 0;
2784 }
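
/* Layout note for the overflow heap initialised in cake_init() above
 * (explanatory only): entry k = j * CAKE_MAX_TINS + i interleaves the
 * queues of all tins, so heap slots 0..7 cover flow 0 of tins 0..7,
 * slots 8..15 cover flow 1, and so on.  b->overflow_idx[j] records where
 * flow j currently sits so cake_heapify_up()/cake_heapify() can keep the
 * queue with the largest backlog at the top for cake_drop() to find.
 */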
2785 
2786 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2787 {
2788 	struct cake_sched_data *q = qdisc_priv(sch);
2789 	struct nlattr *opts;
2790 
2791 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2792 	if (!opts)
2793 		goto nla_put_failure;
2794 
2795 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2796 			      TCA_CAKE_PAD))
2797 		goto nla_put_failure;
2798 
2799 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2800 			q->flow_mode & CAKE_FLOW_MASK))
2801 		goto nla_put_failure;
2802 
2803 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2804 		goto nla_put_failure;
2805 
2806 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2807 		goto nla_put_failure;
2808 
2809 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2810 		goto nla_put_failure;
2811 
2812 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2813 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2814 		goto nla_put_failure;
2815 
2816 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2817 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2818 		goto nla_put_failure;
2819 
2820 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2821 		goto nla_put_failure;
2822 
2823 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2824 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2825 		goto nla_put_failure;
2826 
2827 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2828 		goto nla_put_failure;
2829 
2830 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2831 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2832 		goto nla_put_failure;
2833 
2834 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2835 		goto nla_put_failure;
2836 
2837 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2838 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2839 			goto nla_put_failure;
2840 
2841 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2842 		goto nla_put_failure;
2843 
2844 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2845 		goto nla_put_failure;
2846 
2847 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2848 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2849 		goto nla_put_failure;
2850 
2851 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2852 		goto nla_put_failure;
2853 
2854 	return nla_nest_end(skb, opts);
2855 
2856 nla_put_failure:
2857 	return -1;
2858 }
2859 
2860 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2861 {
2862 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2863 	struct cake_sched_data *q = qdisc_priv(sch);
2864 	struct nlattr *tstats, *ts;
2865 	int i;
2866 
2867 	if (!stats)
2868 		return -1;
2869 
2870 #define PUT_STAT_U32(attr, data) do {				       \
2871 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2872 			goto nla_put_failure;			       \
2873 	} while (0)
2874 #define PUT_STAT_U64(attr, data) do {				       \
2875 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2876 					data, TCA_CAKE_STATS_PAD)) \
2877 			goto nla_put_failure;			       \
2878 	} while (0)
2879 
2880 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2881 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2882 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2883 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2884 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2885 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2886 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2887 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2888 
2889 #undef PUT_STAT_U32
2890 #undef PUT_STAT_U64
2891 
2892 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2893 	if (!tstats)
2894 		goto nla_put_failure;
2895 
2896 #define PUT_TSTAT_U32(attr, data) do {					\
2897 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2898 			goto nla_put_failure;				\
2899 	} while (0)
2900 #define PUT_TSTAT_U64(attr, data) do {					\
2901 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2902 					data, TCA_CAKE_TIN_STATS_PAD))	\
2903 			goto nla_put_failure;				\
2904 	} while (0)
2905 
2906 	for (i = 0; i < q->tin_cnt; i++) {
2907 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2908 
2909 		ts = nla_nest_start_noflag(d->skb, i + 1);
2910 		if (!ts)
2911 			goto nla_put_failure;
2912 
2913 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2914 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2915 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2916 
2917 		PUT_TSTAT_U32(TARGET_US,
2918 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2919 		PUT_TSTAT_U32(INTERVAL_US,
2920 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2921 
2922 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2923 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2924 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2925 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2926 
2927 		PUT_TSTAT_U32(PEAK_DELAY_US,
2928 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2929 		PUT_TSTAT_U32(AVG_DELAY_US,
2930 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2931 		PUT_TSTAT_U32(BASE_DELAY_US,
2932 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2933 
2934 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2935 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2936 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2937 
2938 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2939 					    b->decaying_flow_count);
2940 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2941 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2942 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2943 
2944 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2945 		nla_nest_end(d->skb, ts);
2946 	}
2947 
2948 #undef PUT_TSTAT_U32
2949 #undef PUT_TSTAT_U64
2950 
2951 	nla_nest_end(d->skb, tstats);
2952 	return nla_nest_end(d->skb, stats);
2953 
2954 nla_put_failure:
2955 	nla_nest_cancel(d->skb, stats);
2956 	return -1;
2957 }
2958 
2959 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2960 {
2961 	return NULL;
2962 }
2963 
2964 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2965 {
2966 	return 0;
2967 }
2968 
2969 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2970 			       u32 classid)
2971 {
2972 	return 0;
2973 }
2974 
2975 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2976 {
2977 }
2978 
2979 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2980 					struct netlink_ext_ack *extack)
2981 {
2982 	struct cake_sched_data *q = qdisc_priv(sch);
2983 
2984 	if (cl)
2985 		return NULL;
2986 	return q->block;
2987 }
2988 
2989 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2990 			   struct sk_buff *skb, struct tcmsg *tcm)
2991 {
2992 	tcm->tcm_handle |= TC_H_MIN(cl);
2993 	return 0;
2994 }
2995 
2996 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2997 				 struct gnet_dump *d)
2998 {
2999 	struct cake_sched_data *q = qdisc_priv(sch);
3000 	const struct cake_flow *flow = NULL;
3001 	struct gnet_stats_queue qs = { 0 };
3002 	struct nlattr *stats;
3003 	u32 idx = cl - 1;
3004 
3005 	if (idx < CAKE_QUEUES * q->tin_cnt) {
3006 		const struct cake_tin_data *b = \
3007 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3008 		const struct sk_buff *skb;
3009 
3010 		flow = &b->flows[idx % CAKE_QUEUES];
3011 
3012 		if (flow->head) {
3013 			sch_tree_lock(sch);
3014 			skb = flow->head;
3015 			while (skb) {
3016 				qs.qlen++;
3017 				skb = skb->next;
3018 			}
3019 			sch_tree_unlock(sch);
3020 		}
3021 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3022 		qs.drops = flow->dropped;
3023 	}
3024 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3025 		return -1;
3026 	if (flow) {
3027 		ktime_t now = ktime_get();
3028 
3029 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3030 		if (!stats)
3031 			return -1;
3032 
3033 #define PUT_STAT_U32(attr, data) do {				       \
3034 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3035 			goto nla_put_failure;			       \
3036 	} while (0)
3037 #define PUT_STAT_S32(attr, data) do {				       \
3038 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3039 			goto nla_put_failure;			       \
3040 	} while (0)
3041 
3042 		PUT_STAT_S32(DEFICIT, flow->deficit);
3043 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3044 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3045 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3046 		if (flow->cvars.p_drop) {
3047 			PUT_STAT_S32(BLUE_TIMER_US,
3048 				     ktime_to_us(
3049 					     ktime_sub(now,
3050 						       flow->cvars.blue_timer)));
3051 		}
3052 		if (flow->cvars.dropping) {
3053 			PUT_STAT_S32(DROP_NEXT_US,
3054 				     ktime_to_us(
3055 					     ktime_sub(now,
3056 						       flow->cvars.drop_next)));
3057 		}
3058 
3059 		if (nla_nest_end(d->skb, stats) < 0)
3060 			return -1;
3061 	}
3062 
3063 	return 0;
3064 
3065 nla_put_failure:
3066 	nla_nest_cancel(d->skb, stats);
3067 	return -1;
3068 }
3069 
3070 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3071 {
3072 	struct cake_sched_data *q = qdisc_priv(sch);
3073 	unsigned int i, j;
3074 
3075 	if (arg->stop)
3076 		return;
3077 
3078 	for (i = 0; i < q->tin_cnt; i++) {
3079 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3080 
3081 		for (j = 0; j < CAKE_QUEUES; j++) {
3082 			if (list_empty(&b->flows[j].flowchain)) {
3083 				arg->count++;
3084 				continue;
3085 			}
3086 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3087 						 arg))
3088 				break;
3089 		}
3090 	}
3091 }
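
/* Class numbering used by cake_walk() and cake_dump_class_stats() above
 * (a restatement for readers of "tc -s class show", not new behaviour):
 * the class minor number is user-visible-tin * CAKE_QUEUES + flow + 1, so
 * cl - 1 decodes back into a tin via (cl - 1) / CAKE_QUEUES taken through
 * q->tin_order[], and a flow index (cl - 1) % CAKE_QUEUES within that tin.
 */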
3092 
3093 static const struct Qdisc_class_ops cake_class_ops = {
3094 	.leaf		=	cake_leaf,
3095 	.find		=	cake_find,
3096 	.tcf_block	=	cake_tcf_block,
3097 	.bind_tcf	=	cake_bind,
3098 	.unbind_tcf	=	cake_unbind,
3099 	.dump		=	cake_dump_class,
3100 	.dump_stats	=	cake_dump_class_stats,
3101 	.walk		=	cake_walk,
3102 };
3103 
3104 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3105 	.cl_ops		=	&cake_class_ops,
3106 	.id		=	"cake",
3107 	.priv_size	=	sizeof(struct cake_sched_data),
3108 	.enqueue	=	cake_enqueue,
3109 	.dequeue	=	cake_dequeue,
3110 	.peek		=	qdisc_peek_dequeued,
3111 	.init		=	cake_init,
3112 	.reset		=	cake_reset,
3113 	.destroy	=	cake_destroy,
3114 	.change		=	cake_change,
3115 	.dump		=	cake_dump,
3116 	.dump_stats	=	cake_dump_stats,
3117 	.owner		=	THIS_MODULE,
3118 };
3119 
3120 static int __init cake_module_init(void)
3121 {
3122 	return register_qdisc(&cake_qdisc_ops);
3123 }
3124 
3125 static void __exit cake_module_exit(void)
3126 {
3127 	unregister_qdisc(&cake_qdisc_ops);
3128 }
3129 
3130 module_init(cake_module_init)
3131 module_exit(cake_module_exit)
3132 MODULE_AUTHOR("Jonathan Morton");
3133 MODULE_LICENSE("Dual BSD/GPL");
3134 MODULE_DESCRIPTION("The CAKE shaper.");
3135