1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
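
/* A minimal worked example of the set-associative layout implied by these
 * constants: CAKE_QUEUES / CAKE_SET_WAYS = 1024 / 8 = 128 sets of 8 ways.
 * cake_hash() below splits a queue index h as
 *
 *	inner = h % CAKE_SET_WAYS;	(way within the set, 0..7)
 *	outer = h - inner;		(first queue of the set)
 *
 * so e.g. h = 1021 lands in set 127 (queues 1016..1023) at way 5.
 */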
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop interval (roughly the expected path RTT)
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
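
/* The probabilities are Q0.32 fixed point: drop probability = p_drop / 2^32.
 * Illustrative: with p_inc = 1 << 24 (the value cake_set_rate() later in this
 * file uses, assuming the defaults), each increment adds 2^24 / 2^32 = 1/256,
 * i.e. roughly 0.39% of drop probability.
 */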
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_bulk_flow_count;
142 	u16 dsthost_bulk_flow_count;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum_prio;
177 	u16	tin_quantum_band;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
215 	u16		rate_shft;
216 	ktime_t		time_next_packet;
217 	ktime_t		failsafe_next_packet;
218 	u64		rate_ns;
219 	u64		rate_bps;
220 	u16		rate_flags;
221 	s16		rate_overhead;
222 	u16		rate_mpu;
223 	u64		interval;
224 	u64		target;
225 
226 	/* resource tracking */
227 	u32		buffer_used;
228 	u32		buffer_max_used;
229 	u32		buffer_limit;
230 	u32		buffer_config_limit;
231 
232 	/* indices for dequeue */
233 	u16		cur_tin;
234 	u16		cur_flow;
235 
236 	struct qdisc_watchdog watchdog;
237 	const u8	*tin_index;
238 	const u8	*tin_order;
239 
240 	/* bandwidth capacity estimate */
241 	ktime_t		last_packet_time;
242 	ktime_t		avg_window_begin;
243 	u64		avg_packet_interval;
244 	u64		avg_window_bytes;
245 	u64		avg_peak_bandwidth;
246 	ktime_t		last_reconfig_time;
247 
248 	/* packet length stats */
249 	u32		avg_netoff;
250 	u16		max_netlen;
251 	u16		max_adjlen;
252 	u16		min_netlen;
253 	u16		min_adjlen;
254 };
255 
256 enum {
257 	CAKE_FLAG_OVERHEAD	   = BIT(0),
258 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
259 	CAKE_FLAG_INGRESS	   = BIT(2),
260 	CAKE_FLAG_WASH		   = BIT(3),
261 	CAKE_FLAG_SPLIT_GSO	   = BIT(4),
262 	CAKE_FLAG_FWMARK	   = BIT(5)
263 };
264 
265 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
266  * obtain the best features of each.  Codel is excellent on flows which
267  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
268  * unresponsive flows.
269  */
270 
271 struct cobalt_skb_cb {
272 	ktime_t enqueue_time;
273 	u32     adjusted_len;
274 };
275 
276 static u64 us_to_ns(u64 us)
277 {
278 	return us * NSEC_PER_USEC;
279 }
280 
281 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
282 {
283 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
284 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
285 }
286 
287 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
288 {
289 	return get_cobalt_cb(skb)->enqueue_time;
290 }
291 
292 static void cobalt_set_enqueue_time(struct sk_buff *skb,
293 				    ktime_t now)
294 {
295 	get_cobalt_cb(skb)->enqueue_time = now;
296 }
297 
298 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
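
/* quantum_div[] is a reciprocal table: cake_init() (outside this excerpt)
 * fills quantum_div[i] = 65535 / i, so the scheduler can approximate
 * quantum / host_load as (quantum * quantum_div[host_load]) >> 16 without a
 * divide.  Illustrative: quantum = 1514, host_load = 4 gives
 * (1514 * 16383) >> 16 = 378, matching 1514 / 4 = 378 (rounded down).
 */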
299 
300 /* Diffserv lookup tables */
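
/* Each table below maps a 6-bit DSCP value (0..63, eight entries per row) to
 * a tin index.  Reading diffserv4 as an example: entry 46 (EF) is 3, the
 * highest-priority tin, while entry 8 (CS1) is 1, the bulk tin that
 * bulk_order below lists first when dumping stats.
 */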
301 
302 static const u8 precedence[] = {
303 	0, 0, 0, 0, 0, 0, 0, 0,
304 	1, 1, 1, 1, 1, 1, 1, 1,
305 	2, 2, 2, 2, 2, 2, 2, 2,
306 	3, 3, 3, 3, 3, 3, 3, 3,
307 	4, 4, 4, 4, 4, 4, 4, 4,
308 	5, 5, 5, 5, 5, 5, 5, 5,
309 	6, 6, 6, 6, 6, 6, 6, 6,
310 	7, 7, 7, 7, 7, 7, 7, 7,
311 };
312 
313 static const u8 diffserv8[] = {
314 	2, 5, 1, 2, 4, 2, 2, 2,
315 	0, 2, 1, 2, 1, 2, 1, 2,
316 	5, 2, 4, 2, 4, 2, 4, 2,
317 	3, 2, 3, 2, 3, 2, 3, 2,
318 	6, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 2, 2, 6, 2, 6, 2,
320 	7, 2, 2, 2, 2, 2, 2, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 };
323 
324 static const u8 diffserv4[] = {
325 	0, 2, 0, 0, 2, 0, 0, 0,
326 	1, 0, 0, 0, 0, 0, 0, 0,
327 	2, 0, 2, 0, 2, 0, 2, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	3, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 0, 0, 3, 0, 3, 0,
331 	3, 0, 0, 0, 0, 0, 0, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 };
334 
335 static const u8 diffserv3[] = {
336 	0, 0, 0, 0, 2, 0, 0, 0,
337 	1, 0, 0, 0, 0, 0, 0, 0,
338 	0, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 2, 0, 2, 0,
342 	2, 0, 0, 0, 0, 0, 0, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 };
345 
346 static const u8 besteffort[] = {
347 	0, 0, 0, 0, 0, 0, 0, 0,
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 };
356 
357 /* tin priority order for stats dumping */
358 
359 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
360 static const u8 bulk_order[] = {1, 0, 2, 3};
361 
362 #define REC_INV_SQRT_CACHE (16)
363 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
364 
365 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
366  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
367  *
368  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
369  */
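
/* Worked example (illustrative): for count = 4 the true value is
 * 1/sqrt(4) = 0.5, i.e. 0x80000000 in Q0.32.  Feeding invsqrt = 0x80000000
 * through one step of cobalt_newton_step() below:
 *
 *	invsqrt2 = (2^31 * 2^31) >> 32       = 2^30  (0.25)
 *	val      = (3 << 32) - 4 * 2^30      = 2^33  (2.0)
 *	val    >>= 2                         ->  2^31
 *	val      = (2^31 * 2^31) >> 31       = 2^31  (0.5)
 *
 * A correct estimate is thus a fixed point of the iteration, as expected.
 */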
370 
371 static void cobalt_newton_step(struct cobalt_vars *vars)
372 {
373 	u32 invsqrt, invsqrt2;
374 	u64 val;
375 
376 	invsqrt = vars->rec_inv_sqrt;
377 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
378 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
379 
380 	val >>= 2; /* avoid overflow in following multiply */
381 	val = (val * invsqrt) >> (32 - 2 + 1);
382 
383 	vars->rec_inv_sqrt = val;
384 }
385 
386 static void cobalt_invsqrt(struct cobalt_vars *vars)
387 {
388 	if (vars->count < REC_INV_SQRT_CACHE)
389 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
390 	else
391 		cobalt_newton_step(vars);
392 }
393 
394 /* There is a big difference in timing between the accurate values placed in
395  * the cache and the approximations given by a single Newton step for small
396  * count values, particularly when stepping from count 1 to 2 or vice versa.
397  * Above 16, a single Newton step gives sufficient accuracy in either
398  * direction, given the precision stored.
399  *
400  * The magnitude of the error when stepping up to count 2 is such as to give
401  * the value that *should* have been produced at count 4.
402  */
403 
404 static void cobalt_cache_init(void)
405 {
406 	struct cobalt_vars v;
407 
408 	memset(&v, 0, sizeof(v));
409 	v.rec_inv_sqrt = ~0U;
410 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
411 
412 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
413 		cobalt_newton_step(&v);
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 
418 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
419 	}
420 }
421 
422 static void cobalt_vars_init(struct cobalt_vars *vars)
423 {
424 	memset(vars, 0, sizeof(*vars));
425 
426 	if (!cobalt_rec_inv_sqrt_cache[0]) {
427 		cobalt_cache_init();
428 		cobalt_rec_inv_sqrt_cache[0] = ~0;
429 	}
430 }
431 
432 /* CoDel control_law is t + interval/sqrt(count)
433  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
434  * both sqrt() and divide operation.
435  */
436 static ktime_t cobalt_control(ktime_t t,
437 			      u64 interval,
438 			      u32 rec_inv_sqrt)
439 {
440 	return ktime_add_ns(t, reciprocal_scale(interval,
441 						rec_inv_sqrt));
442 }
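
/* Illustrative: reciprocal_scale(interval, rec_inv_sqrt) evaluates to
 * ((u64)(u32)interval * rec_inv_sqrt) >> 32, i.e. interval / sqrt(count)
 * with rec_inv_sqrt in Q0.32.  With interval = 100 ms and count = 4
 * (rec_inv_sqrt ~ 0x80000000), the next signalling event lands ~50 ms
 * after t.
 */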
443 
444 /* Call this when a packet had to be dropped due to queue overflow.  Returns
445  * true if the BLUE state was quiescent before but active after this call.
446  */
447 static bool cobalt_queue_full(struct cobalt_vars *vars,
448 			      struct cobalt_params *p,
449 			      ktime_t now)
450 {
451 	bool up = false;
452 
453 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
454 		up = !vars->p_drop;
455 		vars->p_drop += p->p_inc;
456 		if (vars->p_drop < p->p_inc)
457 			vars->p_drop = ~0;
458 		vars->blue_timer = now;
459 	}
460 	vars->dropping = true;
461 	vars->drop_next = now;
462 	if (!vars->count)
463 		vars->count = 1;
464 
465 	return up;
466 }
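
/* Illustrative arithmetic: p_drop saturates after about 2^32 / p_inc
 * increments, so with p_inc = 1 << 24 (assuming the defaults set up by
 * cake_set_rate()) roughly 256 queue-full events, each at least 'target'
 * apart, drive the BLUE drop probability to certainty.
 */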
467 
468 /* Call this when the queue was serviced but turned out to be empty.  Returns
469  * true if the BLUE state was active before but quiescent after this call.
470  */
471 static bool cobalt_queue_empty(struct cobalt_vars *vars,
472 			       struct cobalt_params *p,
473 			       ktime_t now)
474 {
475 	bool down = false;
476 
477 	if (vars->p_drop &&
478 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
479 		if (vars->p_drop < p->p_dec)
480 			vars->p_drop = 0;
481 		else
482 			vars->p_drop -= p->p_dec;
483 		vars->blue_timer = now;
484 		down = !vars->p_drop;
485 	}
486 	vars->dropping = false;
487 
488 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
489 		vars->count--;
490 		cobalt_invsqrt(vars);
491 		vars->drop_next = cobalt_control(vars->drop_next,
492 						 p->interval,
493 						 vars->rec_inv_sqrt);
494 	}
495 
496 	return down;
497 }
498 
499 /* Call this with a freshly dequeued packet for possible congestion marking.
500  * Returns true as an instruction to drop the packet, false for delivery.
501  */
502 static bool cobalt_should_drop(struct cobalt_vars *vars,
503 			       struct cobalt_params *p,
504 			       ktime_t now,
505 			       struct sk_buff *skb,
506 			       u32 bulk_flows)
507 {
508 	bool next_due, over_target, drop = false;
509 	ktime_t schedule;
510 	u64 sojourn;
511 
512 /* The 'schedule' variable records, in its sign, whether 'now' is before or
513  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
514  * scheduling decision is actually branched on, without destroying that
515  * information.  Similarly, the first 'schedule' value calculated is preserved
516  * in the boolean 'next_due'.
517  *
518  * As for 'drop_next', we take advantage of the fact that 'interval' is both
519  * the delay between first exceeding 'target' and the first signalling event,
520  * *and* the scaling factor for the signalling frequency.  It's therefore very
521  * natural to use a single mechanism for both purposes, and eliminates a
522  * significant amount of reference Codel's spaghetti code.  To help with this,
523  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
524  * as possible to 1.0 in fixed-point.
525  */
526 
527 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
528 	schedule = ktime_sub(now, vars->drop_next);
529 	over_target = sojourn > p->target &&
530 		      sojourn > p->mtu_time * bulk_flows * 2 &&
531 		      sojourn > p->mtu_time * 4;
532 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
533 
534 	vars->ecn_marked = false;
535 
536 	if (over_target) {
537 		if (!vars->dropping) {
538 			vars->dropping = true;
539 			vars->drop_next = cobalt_control(now,
540 							 p->interval,
541 							 vars->rec_inv_sqrt);
542 		}
543 		if (!vars->count)
544 			vars->count = 1;
545 	} else if (vars->dropping) {
546 		vars->dropping = false;
547 	}
548 
549 	if (next_due && vars->dropping) {
550 		/* Use ECN mark if possible, otherwise drop */
551 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
552 
553 		vars->count++;
554 		if (!vars->count)
555 			vars->count--;
556 		cobalt_invsqrt(vars);
557 		vars->drop_next = cobalt_control(vars->drop_next,
558 						 p->interval,
559 						 vars->rec_inv_sqrt);
560 		schedule = ktime_sub(now, vars->drop_next);
561 	} else {
562 		while (next_due) {
563 			vars->count--;
564 			cobalt_invsqrt(vars);
565 			vars->drop_next = cobalt_control(vars->drop_next,
566 							 p->interval,
567 							 vars->rec_inv_sqrt);
568 			schedule = ktime_sub(now, vars->drop_next);
569 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
570 		}
571 	}
572 
573 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
574 	if (vars->p_drop)
575 		drop |= (prandom_u32() < vars->p_drop);
576 
577 	/* Overload the drop_next field as an activity timeout */
578 	if (!vars->count)
579 		vars->drop_next = ktime_add_ns(now, p->interval);
580 	else if (ktime_to_ns(schedule) > 0 && !drop)
581 		vars->drop_next = now;
582 
583 	return drop;
584 }
585 
586 static void cake_update_flowkeys(struct flow_keys *keys,
587 				 const struct sk_buff *skb)
588 {
589 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
590 	struct nf_conntrack_tuple tuple = {};
591 	bool rev = !skb->_nfct;
592 
593 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
594 		return;
595 
596 	if (!nf_ct_get_tuple_skb(&tuple, skb))
597 		return;
598 
599 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
600 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
601 
602 	if (keys->ports.ports) {
603 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
604 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
605 	}
606 #endif
607 }
608 
609 /* Cake's dual-host flow modes share their bits with triple-isolate
610  * mode, so a plain bit test would also match triple isolation; hence
611  * the full-mask comparisons below. */
612 
613 static bool cake_dsrc(int flow_mode)
614 {
615 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
616 }
617 
618 static bool cake_ddst(int flow_mode)
619 {
620 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
621 }
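
/* Illustrative, assuming the standard uapi values: CAKE_FLOW_DUAL_SRC = 5
 * (FLOWS|SRC_IP), CAKE_FLOW_DUAL_DST = 6 (FLOWS|DST_IP) and
 * CAKE_FLOW_TRIPLE = 7 (FLOWS|HOSTS).  A plain (flow_mode &
 * CAKE_FLOW_DUAL_SRC) test would be nonzero for any mode sharing either
 * bit; the full-mask comparison above is true only for modes 5 and 7,
 * i.e. dual-src and triple-isolate.
 */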
622 
623 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
624 		     int flow_mode, u16 flow_override, u16 host_override)
625 {
626 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
627 	u16 reduced_hash, srchost_idx, dsthost_idx;
628 	struct flow_keys keys, host_keys;
629 
630 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
631 		return 0;
632 
633 	/* If both overrides are set we can skip packet dissection entirely */
634 	if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
635 	    (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
636 		goto skip_hash;
637 
638 	skb_flow_dissect_flow_keys(skb, &keys,
639 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
640 
641 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
642 		cake_update_flowkeys(&keys, skb);
643 
644 	/* flow_hash_from_keys() sorts the addresses by value, so we have
645 	 * to preserve their order in a separate data structure to treat
646 	 * src and dst host addresses as independently selectable.
647 	 */
648 	host_keys = keys;
649 	host_keys.ports.ports     = 0;
650 	host_keys.basic.ip_proto  = 0;
651 	host_keys.keyid.keyid     = 0;
652 	host_keys.tags.flow_label = 0;
653 
654 	switch (host_keys.control.addr_type) {
655 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
656 		host_keys.addrs.v4addrs.src = 0;
657 		dsthost_hash = flow_hash_from_keys(&host_keys);
658 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
659 		host_keys.addrs.v4addrs.dst = 0;
660 		srchost_hash = flow_hash_from_keys(&host_keys);
661 		break;
662 
663 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
664 		memset(&host_keys.addrs.v6addrs.src, 0,
665 		       sizeof(host_keys.addrs.v6addrs.src));
666 		dsthost_hash = flow_hash_from_keys(&host_keys);
667 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
668 		memset(&host_keys.addrs.v6addrs.dst, 0,
669 		       sizeof(host_keys.addrs.v6addrs.dst));
670 		srchost_hash = flow_hash_from_keys(&host_keys);
671 		break;
672 
673 	default:
674 		dsthost_hash = 0;
675 		srchost_hash = 0;
676 	}
677 
678 	/* This *must* be after the above switch, since as a
679 	 * side-effect it sorts the src and dst addresses.
680 	 */
681 	if (flow_mode & CAKE_FLOW_FLOWS)
682 		flow_hash = flow_hash_from_keys(&keys);
683 
684 skip_hash:
685 	if (flow_override)
686 		flow_hash = flow_override - 1;
687 	if (host_override) {
688 		dsthost_hash = host_override - 1;
689 		srchost_hash = host_override - 1;
690 	}
691 
692 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
693 		if (flow_mode & CAKE_FLOW_SRC_IP)
694 			flow_hash ^= srchost_hash;
695 
696 		if (flow_mode & CAKE_FLOW_DST_IP)
697 			flow_hash ^= dsthost_hash;
698 	}
699 
700 	reduced_hash = flow_hash % CAKE_QUEUES;
701 
702 	/* set-associative hashing */
703 	/* fast path if no hash collision (direct lookup succeeds) */
704 	if (likely(q->tags[reduced_hash] == flow_hash &&
705 		   q->flows[reduced_hash].set)) {
706 		q->way_directs++;
707 	} else {
708 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
709 		u32 outer_hash = reduced_hash - inner_hash;
710 		bool allocate_src = false;
711 		bool allocate_dst = false;
712 		u32 i, k;
713 
714 		/* check if any active queue in the set is reserved for
715 		 * this flow.
716 		 */
717 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
718 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
719 			if (q->tags[outer_hash + k] == flow_hash) {
720 				if (i)
721 					q->way_hits++;
722 
723 				if (!q->flows[outer_hash + k].set) {
724 					/* need to increment host refcnts */
725 					allocate_src = cake_dsrc(flow_mode);
726 					allocate_dst = cake_ddst(flow_mode);
727 				}
728 
729 				goto found;
730 			}
731 		}
732 
733 		/* no queue is reserved for this flow, look for an
734 		 * empty one.
735 		 */
736 		for (i = 0; i < CAKE_SET_WAYS;
737 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
738 			if (!q->flows[outer_hash + k].set) {
739 				q->way_misses++;
740 				allocate_src = cake_dsrc(flow_mode);
741 				allocate_dst = cake_ddst(flow_mode);
742 				goto found;
743 			}
744 		}
745 
746 		/* With no empty queues, default to the original
747 		 * queue, accept the collision, update the host tags.
748 		 */
749 		q->way_collisions++;
750 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
751 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
752 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
753 		}
754 		allocate_src = cake_dsrc(flow_mode);
755 		allocate_dst = cake_ddst(flow_mode);
756 found:
757 		/* reserve queue for future packets in same flow */
758 		reduced_hash = outer_hash + k;
759 		q->tags[reduced_hash] = flow_hash;
760 
761 		if (allocate_src) {
762 			srchost_idx = srchost_hash % CAKE_QUEUES;
763 			inner_hash = srchost_idx % CAKE_SET_WAYS;
764 			outer_hash = srchost_idx - inner_hash;
765 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
766 				i++, k = (k + 1) % CAKE_SET_WAYS) {
767 				if (q->hosts[outer_hash + k].srchost_tag ==
768 				    srchost_hash)
769 					goto found_src;
770 			}
771 			for (i = 0; i < CAKE_SET_WAYS;
772 				i++, k = (k + 1) % CAKE_SET_WAYS) {
773 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
774 					break;
775 			}
776 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
777 found_src:
778 			srchost_idx = outer_hash + k;
779 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
780 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
781 			q->flows[reduced_hash].srchost = srchost_idx;
782 		}
783 
784 		if (allocate_dst) {
785 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
786 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
787 			outer_hash = dsthost_idx - inner_hash;
788 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
789 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
790 				if (q->hosts[outer_hash + k].dsthost_tag ==
791 				    dsthost_hash)
792 					goto found_dst;
793 			}
794 			for (i = 0; i < CAKE_SET_WAYS;
795 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
796 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
797 					break;
798 			}
799 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
800 found_dst:
801 			dsthost_idx = outer_hash + k;
802 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
803 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
804 			q->flows[reduced_hash].dsthost = dsthost_idx;
805 		}
806 	}
807 
808 	return reduced_hash;
809 }
810 
811 /* helper functions: might be changed when/if skb uses a standard list_head */
812 /* remove one skb from head of slot queue */
813 
814 static struct sk_buff *dequeue_head(struct cake_flow *flow)
815 {
816 	struct sk_buff *skb = flow->head;
817 
818 	if (skb) {
819 		flow->head = skb->next;
820 		skb_mark_not_on_list(skb);
821 	}
822 
823 	return skb;
824 }
825 
826 /* add skb to flow queue (tail add) */
827 
828 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
829 {
830 	if (!flow->head)
831 		flow->head = skb;
832 	else
833 		flow->tail->next = skb;
834 	flow->tail = skb;
835 	skb->next = NULL;
836 }
837 
838 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
839 				    struct ipv6hdr *buf)
840 {
841 	unsigned int offset = skb_network_offset(skb);
842 	struct iphdr *iph;
843 
844 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
845 
846 	if (!iph)
847 		return NULL;
848 
849 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
850 		return skb_header_pointer(skb, offset + iph->ihl * 4,
851 					  sizeof(struct ipv6hdr), buf);
852 
853 	else if (iph->version == 4)
854 		return iph;
855 
856 	else if (iph->version == 6)
857 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
858 					  buf);
859 
860 	return NULL;
861 }
862 
863 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
864 				      void *buf, unsigned int bufsize)
865 {
866 	unsigned int offset = skb_network_offset(skb);
867 	const struct ipv6hdr *ipv6h;
868 	const struct tcphdr *tcph;
869 	const struct iphdr *iph;
870 	struct ipv6hdr _ipv6h;
871 	struct tcphdr _tcph;
872 
873 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
874 
875 	if (!ipv6h)
876 		return NULL;
877 
878 	if (ipv6h->version == 4) {
879 		iph = (struct iphdr *)ipv6h;
880 		offset += iph->ihl * 4;
881 
882 		/* special-case 6in4 tunnelling, as that is a common way to get
883 		 * v6 connectivity in the home
884 		 */
885 		if (iph->protocol == IPPROTO_IPV6) {
886 			ipv6h = skb_header_pointer(skb, offset,
887 						   sizeof(_ipv6h), &_ipv6h);
888 
889 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
890 				return NULL;
891 
892 			offset += sizeof(struct ipv6hdr);
893 
894 		} else if (iph->protocol != IPPROTO_TCP) {
895 			return NULL;
896 		}
897 
898 	} else if (ipv6h->version == 6) {
899 		if (ipv6h->nexthdr != IPPROTO_TCP)
900 			return NULL;
901 
902 		offset += sizeof(struct ipv6hdr);
903 	} else {
904 		return NULL;
905 	}
906 
907 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
908 	if (!tcph)
909 		return NULL;
910 
911 	return skb_header_pointer(skb, offset,
912 				  min(__tcp_hdrlen(tcph), bufsize), buf);
913 }
914 
915 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
916 				   int code, int *oplen)
917 {
918 	/* inspired by tcp_parse_options in tcp_input.c */
919 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
920 	const u8 *ptr = (const u8 *)(tcph + 1);
921 
922 	while (length > 0) {
923 		int opcode = *ptr++;
924 		int opsize;
925 
926 		if (opcode == TCPOPT_EOL)
927 			break;
928 		if (opcode == TCPOPT_NOP) {
929 			length--;
930 			continue;
931 		}
932 		opsize = *ptr++;
933 		if (opsize < 2 || opsize > length)
934 			break;
935 
936 		if (opcode == code) {
937 			*oplen = opsize;
938 			return ptr;
939 		}
940 
941 		ptr += opsize - 2;
942 		length -= opsize;
943 	}
944 
945 	return NULL;
946 }
947 
948 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
949  * bytes than the other. In the case where both sequences ACK bytes that the
950  * other doesn't, A is considered greater. DSACKs in A also make A
951  * considered greater.
952  *
953  * @return -1, 0 or 1 as normal compare functions
954  */
955 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
956 				  const struct tcphdr *tcph_b)
957 {
958 	const struct tcp_sack_block_wire *sack_a, *sack_b;
959 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
960 	u32 bytes_a = 0, bytes_b = 0;
961 	int oplen_a, oplen_b;
962 	bool first = true;
963 
964 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
965 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
966 
967 	/* pointers point to option contents */
968 	oplen_a -= TCPOLEN_SACK_BASE;
969 	oplen_b -= TCPOLEN_SACK_BASE;
970 
971 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
972 	    (!sack_b || oplen_b < sizeof(*sack_b)))
973 		return -1;
974 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
975 		 (!sack_a || oplen_a < sizeof(*sack_a)))
976 		return 1;
977 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
978 		 (!sack_b || oplen_b < sizeof(*sack_b)))
979 		return 0;
980 
981 	while (oplen_a >= sizeof(*sack_a)) {
982 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
983 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
984 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
985 		int oplen_tmp = oplen_b;
986 		bool found = false;
987 
988 		/* DSACK; always considered greater to prevent dropping */
989 		if (before(start_a, ack_seq_a))
990 			return -1;
991 
992 		bytes_a += end_a - start_a;
993 
994 		while (oplen_tmp >= sizeof(*sack_tmp)) {
995 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
996 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
997 
998 			/* first time through we count the total size */
999 			if (first)
1000 				bytes_b += end_b - start_b;
1001 
1002 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1003 				found = true;
1004 				if (!first)
1005 					break;
1006 			}
1007 			oplen_tmp -= sizeof(*sack_tmp);
1008 			sack_tmp++;
1009 		}
1010 
1011 		if (!found)
1012 			return -1;
1013 
1014 		oplen_a -= sizeof(*sack_a);
1015 		sack_a++;
1016 		first = false;
1017 	}
1018 
1019 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1020 	 * either the SACKs are equal, or B SACKs more bytes.
1021 	 */
1022 	return bytes_b > bytes_a ? 1 : 0;
1023 }
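
/* Illustrative: if A SACKs 1000-2000 and B SACKs 1000-3000 (with neither
 * DSACKing), every range in A is covered by B and bytes_b = 2000 exceeds
 * bytes_a = 1000, so the function returns 1 ("B is greater").
 */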
1024 
1025 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1026 				 u32 *tsval, u32 *tsecr)
1027 {
1028 	const u8 *ptr;
1029 	int opsize;
1030 
1031 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1032 
1033 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1034 		*tsval = get_unaligned_be32(ptr);
1035 		*tsecr = get_unaligned_be32(ptr + 4);
1036 	}
1037 }
1038 
1039 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1040 			       u32 tstamp_new, u32 tsecr_new)
1041 {
1042 	/* inspired by tcp_parse_options in tcp_input.c */
1043 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1044 	const u8 *ptr = (const u8 *)(tcph + 1);
1045 	u32 tstamp, tsecr;
1046 
1047 	/* 3 reserved flags must be unset to avoid future breakage
1048 	 * ACK must be set
1049 	 * ECE/CWR are handled separately
1050 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1051 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1052 	 * 0x00C00000 = CWR/ECE (handled separately)
1053 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1054 	 */
1055 	if (((tcp_flag_word(tcph) &
1056 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1057 		return false;
1058 
1059 	while (length > 0) {
1060 		int opcode = *ptr++;
1061 		int opsize;
1062 
1063 		if (opcode == TCPOPT_EOL)
1064 			break;
1065 		if (opcode == TCPOPT_NOP) {
1066 			length--;
1067 			continue;
1068 		}
1069 		opsize = *ptr++;
1070 		if (opsize < 2 || opsize > length)
1071 			break;
1072 
1073 		switch (opcode) {
1074 		case TCPOPT_MD5SIG: /* doesn't influence state */
1075 			break;
1076 
1077 		case TCPOPT_SACK: /* stricter checking performed later */
1078 			if (opsize % 8 != 2)
1079 				return false;
1080 			break;
1081 
1082 		case TCPOPT_TIMESTAMP:
1083 			/* only drop timestamps lower than new */
1084 			if (opsize != TCPOLEN_TIMESTAMP)
1085 				return false;
1086 			tstamp = get_unaligned_be32(ptr);
1087 			tsecr = get_unaligned_be32(ptr + 4);
1088 			if (after(tstamp, tstamp_new) ||
1089 			    after(tsecr, tsecr_new))
1090 				return false;
1091 			break;
1092 
1093 		case TCPOPT_MSS:  /* these should only be set on SYN */
1094 		case TCPOPT_WINDOW:
1095 		case TCPOPT_SACK_PERM:
1096 		case TCPOPT_FASTOPEN:
1097 		case TCPOPT_EXP:
1098 		default: /* don't drop if any unknown options are present */
1099 			return false;
1100 		}
1101 
1102 		ptr += opsize - 2;
1103 		length -= opsize;
1104 	}
1105 
1106 	return true;
1107 }
1108 
1109 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1110 				       struct cake_flow *flow)
1111 {
1112 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1113 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1114 	struct sk_buff *skb_check, *skb_prev = NULL;
1115 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1116 	unsigned char _tcph[64], _tcph_check[64];
1117 	const struct tcphdr *tcph, *tcph_check;
1118 	const struct iphdr *iph, *iph_check;
1119 	struct ipv6hdr _iph, _iph_check;
1120 	const struct sk_buff *skb;
1121 	int seglen, num_found = 0;
1122 	u32 tstamp = 0, tsecr = 0;
1123 	__be32 elig_flags = 0;
1124 	int sack_comp;
1125 
1126 	/* no other possible ACKs to filter */
1127 	if (flow->head == flow->tail)
1128 		return NULL;
1129 
1130 	skb = flow->tail;
1131 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1132 	iph = cake_get_iphdr(skb, &_iph);
1133 	if (!tcph)
1134 		return NULL;
1135 
1136 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1137 
1138 	/* the 'triggering' packet need only have the ACK flag set.
1139 	 * also check that SYN is not set, as there won't be any previous ACKs.
1140 	 */
1141 	if ((tcp_flag_word(tcph) &
1142 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1143 		return NULL;
1144 
1145 	/* the 'triggering' ACK is at the tail of the queue, we have already
1146 	 * returned if it is the only packet in the flow. loop through the rest
1147 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1148 	 * triggering one.
1149 	 */
1150 	for (skb_check = flow->head;
1151 	     skb_check && skb_check != skb;
1152 	     skb_prev = skb_check, skb_check = skb_check->next) {
1153 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1154 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1155 					     sizeof(_tcph_check));
1156 
1157 		/* only TCP packets with matching 5-tuple are eligible, and only
1158 		 * drop safe headers
1159 		 */
1160 		if (!tcph_check || iph->version != iph_check->version ||
1161 		    tcph_check->source != tcph->source ||
1162 		    tcph_check->dest != tcph->dest)
1163 			continue;
1164 
1165 		if (iph_check->version == 4) {
1166 			if (iph_check->saddr != iph->saddr ||
1167 			    iph_check->daddr != iph->daddr)
1168 				continue;
1169 
1170 			seglen = ntohs(iph_check->tot_len) -
1171 				       (4 * iph_check->ihl);
1172 		} else if (iph_check->version == 6) {
1173 			ipv6h = (struct ipv6hdr *)iph;
1174 			ipv6h_check = (struct ipv6hdr *)iph_check;
1175 
1176 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1177 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1178 				continue;
1179 
1180 			seglen = ntohs(ipv6h_check->payload_len);
1181 		} else {
1182 			WARN_ON(1);  /* shouldn't happen */
1183 			continue;
1184 		}
1185 
1186 		/* If the ECE/CWR flags changed from the previous eligible
1187 		 * packet in the same flow, we should no longer be dropping that
1188 		 * previous packet as this would lose information.
1189 		 */
1190 		if (elig_ack && (tcp_flag_word(tcph_check) &
1191 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1192 			elig_ack = NULL;
1193 			elig_ack_prev = NULL;
1194 			num_found--;
1195 		}
1196 
1197 		/* Check TCP options and flags, don't drop ACKs with segment
1198 		 * data, and don't drop ACKs with a higher cumulative ACK
1199 		 * counter than the triggering packet. Check ACK seqno here to
1200 		 * avoid parsing SACK options of packets we are going to exclude
1201 		 * anyway.
1202 		 */
1203 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1204 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1205 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1206 			continue;
1207 
1208 		/* Check SACK options. The triggering packet must SACK more data
1209 		 * than the ACK under consideration, or SACK the same range but
1210 		 * have a larger cumulative ACK counter. The latter is a
1211 		 * pathological case, but is contained in the following check
1212 		 * anyway, just to be safe.
1213 		 */
1214 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1215 
1216 		if (sack_comp < 0 ||
1217 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1218 		     sack_comp == 0))
1219 			continue;
1220 
1221 		/* At this point we have found an eligible pure ACK to drop; if
1222 		 * we are in aggressive mode, we are done. Otherwise, keep
1223 		 * searching unless this is the second eligible ACK we
1224 		 * found.
1225 		 *
1226 		 * Since we want to drop ACK closest to the head of the queue,
1227 		 * save the first eligible ACK we find, even if we need to loop
1228 		 * again.
1229 		 */
1230 		if (!elig_ack) {
1231 			elig_ack = skb_check;
1232 			elig_ack_prev = skb_prev;
1233 			elig_flags = (tcp_flag_word(tcph_check)
1234 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1235 		}
1236 
1237 		if (num_found++ > 0)
1238 			goto found;
1239 	}
1240 
1241 /* We made it through the queue without finding two eligible ACKs. If
1242 	 * we found a single eligible ACK we can drop it in aggressive mode if
1243 	 * we can guarantee that this does not interfere with ECN flag
1244 	 * information. We ensure this by dropping it only if the enqueued
1245 	 * packet is consecutive with the eligible ACK, and their flags match.
1246 	 */
1247 	if (elig_ack && aggressive && elig_ack->next == skb &&
1248 	    (elig_flags == (tcp_flag_word(tcph) &
1249 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1250 		goto found;
1251 
1252 	return NULL;
1253 
1254 found:
1255 	if (elig_ack_prev)
1256 		elig_ack_prev->next = elig_ack->next;
1257 	else
1258 		flow->head = elig_ack->next;
1259 
1260 	skb_mark_not_on_list(elig_ack);
1261 
1262 	return elig_ack;
1263 }
1264 
1265 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1266 {
1267 	avg -= avg >> shift;
1268 	avg += sample >> shift;
1269 	return avg;
1270 }
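
/* Illustrative: cake_ewma() computes avg * (1 - 2^-shift) + sample * 2^-shift
 * using only shifts.  E.g. avg = 6400, sample = 8960, shift = 8 gives
 * 6400 - 25 + 35 = 6410, nudging the average 1/256 of the way to the sample.
 */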
1271 
1272 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1273 {
1274 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1275 		len -= off;
1276 
1277 	if (q->max_netlen < len)
1278 		q->max_netlen = len;
1279 	if (q->min_netlen > len)
1280 		q->min_netlen = len;
1281 
1282 	len += q->rate_overhead;
1283 
1284 	if (len < q->rate_mpu)
1285 		len = q->rate_mpu;
1286 
1287 	if (q->atm_mode == CAKE_ATM_ATM) {
1288 		len += 47;
1289 		len /= 48;
1290 		len *= 53;
1291 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1292 		/* Add one byte per 64 bytes or part thereof.
1293 		 * This is conservative and easier to calculate than the
1294 		 * precise value.
1295 		 */
1296 		len += (len + 63) / 64;
1297 	}
1298 
1299 	if (q->max_adjlen < len)
1300 		q->max_adjlen = len;
1301 	if (q->min_adjlen > len)
1302 		q->min_adjlen = len;
1303 
1304 	return len;
1305 }
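
/* Illustrative: in CAKE_ATM_ATM mode a 100-byte frame is charged
 * ((100 + 47) / 48) * 53 = 3 * 53 = 159 bytes, i.e. three full 53-byte
 * cells carrying 48 bytes of payload each.  In CAKE_ATM_PTM mode the same
 * frame is charged 100 + (100 + 63) / 64 = 102 bytes, a conservative
 * stand-in for PTM's 65/64 encoding expansion.
 */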
1306 
1307 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1308 {
1309 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1310 	unsigned int hdr_len, last_len = 0;
1311 	u32 off = skb_network_offset(skb);
1312 	u32 len = qdisc_pkt_len(skb);
1313 	u16 segs = 1;
1314 
1315 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1316 
1317 	if (!shinfo->gso_size)
1318 		return cake_calc_overhead(q, len, off);
1319 
1320 	/* borrowed from qdisc_pkt_len_init() */
1321 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1322 
1323 	/* + transport layer */
1324 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1325 						SKB_GSO_TCPV6))) {
1326 		const struct tcphdr *th;
1327 		struct tcphdr _tcphdr;
1328 
1329 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1330 					sizeof(_tcphdr), &_tcphdr);
1331 		if (likely(th))
1332 			hdr_len += __tcp_hdrlen(th);
1333 	} else {
1334 		struct udphdr _udphdr;
1335 
1336 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1337 				       sizeof(_udphdr), &_udphdr))
1338 			hdr_len += sizeof(struct udphdr);
1339 	}
1340 
1341 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1342 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1343 				    shinfo->gso_size);
1344 	else
1345 		segs = shinfo->gso_segs;
1346 
1347 	len = shinfo->gso_size + hdr_len;
1348 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1349 
1350 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1351 		cake_calc_overhead(q, last_len, off));
1352 }
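
/* Illustrative: a GSO super-packet with hdr_len = 66, gso_size = 1448 and
 * skb->len = 3462 yields segs = 3 (in the DODGY case,
 * DIV_ROUND_UP(3462 - 66, 1448) = 3), len = 1448 + 66 = 1514 and
 * last_len = 3462 - 2 * 1448 = 566, so the flow is charged for two
 * 1514-byte segments plus one 566-byte segment, as they would appear on
 * the wire after segmentation.
 */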
1353 
1354 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1355 {
1356 	struct cake_heap_entry ii = q->overflow_heap[i];
1357 	struct cake_heap_entry jj = q->overflow_heap[j];
1358 
1359 	q->overflow_heap[i] = jj;
1360 	q->overflow_heap[j] = ii;
1361 
1362 	q->tins[ii.t].overflow_idx[ii.b] = j;
1363 	q->tins[jj.t].overflow_idx[jj.b] = i;
1364 }
1365 
1366 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1367 {
1368 	struct cake_heap_entry ii = q->overflow_heap[i];
1369 
1370 	return q->tins[ii.t].backlogs[ii.b];
1371 }
1372 
1373 static void cake_heapify(struct cake_sched_data *q, u16 i)
1374 {
1375 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1376 	u32 mb = cake_heap_get_backlog(q, i);
1377 	u32 m = i;
1378 
1379 	while (m < a) {
1380 		u32 l = m + m + 1;
1381 		u32 r = l + 1;
1382 
1383 		if (l < a) {
1384 			u32 lb = cake_heap_get_backlog(q, l);
1385 
1386 			if (lb > mb) {
1387 				m  = l;
1388 				mb = lb;
1389 			}
1390 		}
1391 
1392 		if (r < a) {
1393 			u32 rb = cake_heap_get_backlog(q, r);
1394 
1395 			if (rb > mb) {
1396 				m  = r;
1397 				mb = rb;
1398 			}
1399 		}
1400 
1401 		if (m != i) {
1402 			cake_heap_swap(q, i, m);
1403 			i = m;
1404 		} else {
1405 			break;
1406 		}
1407 	}
1408 }
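
/* Illustrative: the heap spans CAKE_MAX_TINS * CAKE_QUEUES = 8192 entries,
 * with node m's children at 2m+1 and 2m+2.  Every index >= 8192/2 is a
 * leaf, which is why cake_drop() below can rebuild a stale heap by calling
 * cake_heapify() from index 4096 down to 0.
 */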
1409 
1410 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1411 {
1412 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1413 		u16 p = (i - 1) >> 1;
1414 		u32 ib = cake_heap_get_backlog(q, i);
1415 		u32 pb = cake_heap_get_backlog(q, p);
1416 
1417 		if (ib > pb) {
1418 			cake_heap_swap(q, i, p);
1419 			i = p;
1420 		} else {
1421 			break;
1422 		}
1423 	}
1424 }
1425 
1426 static int cake_advance_shaper(struct cake_sched_data *q,
1427 			       struct cake_tin_data *b,
1428 			       struct sk_buff *skb,
1429 			       ktime_t now, bool drop)
1430 {
1431 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1432 
1433 	/* charge packet bandwidth to this tin
1434 	 * and to the global shaper.
1435 	 */
1436 	if (q->rate_ns) {
1437 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1438 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1439 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1440 
1441 		if (ktime_before(b->time_next_packet, now))
1442 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1443 							   tin_dur);
1444 
1445 		else if (ktime_before(b->time_next_packet,
1446 				      ktime_add_ns(now, tin_dur)))
1447 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1448 
1449 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1450 						   global_dur);
1451 		if (!drop)
1452 			q->failsafe_next_packet = \
1453 				ktime_add_ns(q->failsafe_next_packet,
1454 					     failsafe_dur);
1455 	}
1456 	return len;
1457 }
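
/* Illustrative: at 100 Mbit/s a byte takes 80 ns to serialise, so rate_ns
 * and rate_shft are chosen (by cake_set_rate(), outside this excerpt) such
 * that (len * rate_ns) >> rate_shft ~= len * 80.  A 1514-byte packet then
 * advances time_next_packet by ~121 us, and the failsafe, charged at 1.5x,
 * by ~182 us.
 */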
1458 
1459 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1460 {
1461 	struct cake_sched_data *q = qdisc_priv(sch);
1462 	ktime_t now = ktime_get();
1463 	u32 idx = 0, tin = 0, len;
1464 	struct cake_heap_entry qq;
1465 	struct cake_tin_data *b;
1466 	struct cake_flow *flow;
1467 	struct sk_buff *skb;
1468 
1469 	if (!q->overflow_timeout) {
1470 		int i;
1471 		/* Build fresh max-heap */
1472 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1473 			cake_heapify(q, i);
1474 	}
1475 	q->overflow_timeout = 65535;
1476 
1477 	/* select longest queue for pruning */
1478 	qq  = q->overflow_heap[0];
1479 	tin = qq.t;
1480 	idx = qq.b;
1481 
1482 	b = &q->tins[tin];
1483 	flow = &b->flows[idx];
1484 	skb = dequeue_head(flow);
1485 	if (unlikely(!skb)) {
1486 		/* heap has gone wrong, rebuild it next time */
1487 		q->overflow_timeout = 0;
1488 		return idx + (tin << 16);
1489 	}
1490 
1491 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1492 		b->unresponsive_flow_count++;
1493 
1494 	len = qdisc_pkt_len(skb);
1495 	q->buffer_used      -= skb->truesize;
1496 	b->backlogs[idx]    -= len;
1497 	b->tin_backlog      -= len;
1498 	sch->qstats.backlog -= len;
1499 	qdisc_tree_reduce_backlog(sch, 1, len);
1500 
1501 	flow->dropped++;
1502 	b->tin_dropped++;
1503 	sch->qstats.drops++;
1504 
1505 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1506 		cake_advance_shaper(q, b, skb, now, true);
1507 
1508 	__qdisc_drop(skb, to_free);
1509 	sch->q.qlen--;
1510 
1511 	cake_heapify(q, 0);
1512 
1513 	return idx + (tin << 16);
1514 }
1515 
1516 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1517 {
1518 	u8 dscp;
1519 
1520 	switch (skb->protocol) {
1521 	case htons(ETH_P_IP):
1522 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1523 		if (wash && dscp)
1524 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1525 		return dscp;
1526 
1527 	case htons(ETH_P_IPV6):
1528 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1529 		if (wash && dscp)
1530 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1531 		return dscp;
1532 
1533 	case htons(ETH_P_ARP):
1534 		return 0x38;  /* CS7 - Net Control */
1535 
1536 	default:
1537 		/* If there is no Diffserv field, treat as best-effort */
1538 		return 0;
1539 	}
1540 }
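
/* Illustrative: the DS field carries DSCP in its upper six bits and ECN in
 * the lower two, hence the >> 2 above; a TOS byte of 0xB8 yields DSCP
 * 0xB8 >> 2 = 46 (EF).  Washing calls ipv4_change_dsfield(iph,
 * INET_ECN_MASK, 0), which preserves the two ECN bits and zeroes the DSCP
 * bits.
 */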
1541 
1542 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1543 					     struct sk_buff *skb)
1544 {
1545 	struct cake_sched_data *q = qdisc_priv(sch);
1546 	u32 tin;
1547 	u8 dscp;
1548 
1549 	/* Tin selection: Default to diffserv-based selection, allow overriding
1550 	 * using firewall marks or skb->priority.
1551 	 */
1552 	dscp = cake_handle_diffserv(skb,
1553 				    q->rate_flags & CAKE_FLAG_WASH);
1554 
1555 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1556 		tin = 0;
1557 
1558 	else if (q->rate_flags & CAKE_FLAG_FWMARK && /* use fw mark */
1559 		 skb->mark &&
1560 		 skb->mark <= q->tin_cnt)
1561 		tin = q->tin_order[skb->mark - 1];
1562 
1563 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1564 		 TC_H_MIN(skb->priority) > 0 &&
1565 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1566 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1567 
1568 	else {
1569 		tin = q->tin_index[dscp];
1570 
1571 		if (unlikely(tin >= q->tin_cnt))
1572 			tin = 0;
1573 	}
1574 
1575 	return &q->tins[tin];
1576 }
1577 
1578 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1579 			 struct sk_buff *skb, int flow_mode, int *qerr)
1580 {
1581 	struct cake_sched_data *q = qdisc_priv(sch);
1582 	struct tcf_proto *filter;
1583 	struct tcf_result res;
1584 	u16 flow = 0, host = 0;
1585 	int result;
1586 
1587 	filter = rcu_dereference_bh(q->filter_list);
1588 	if (!filter)
1589 		goto hash;
1590 
1591 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1592 	result = tcf_classify(skb, filter, &res, false);
1593 
1594 	if (result >= 0) {
1595 #ifdef CONFIG_NET_CLS_ACT
1596 		switch (result) {
1597 		case TC_ACT_STOLEN:
1598 		case TC_ACT_QUEUED:
1599 		case TC_ACT_TRAP:
1600 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1601 			/* fall through */
1602 		case TC_ACT_SHOT:
1603 			return 0;
1604 		}
1605 #endif
1606 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1607 			flow = TC_H_MIN(res.classid);
1608 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1609 			host = TC_H_MAJ(res.classid) >> 16;
1610 	}
1611 hash:
1612 	*t = cake_select_tin(sch, skb);
1613 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1614 }
1615 
1616 static void cake_reconfigure(struct Qdisc *sch);
1617 
1618 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1619 			struct sk_buff **to_free)
1620 {
1621 	struct cake_sched_data *q = qdisc_priv(sch);
1622 	int len = qdisc_pkt_len(skb);
1623 	int uninitialized_var(ret);
1624 	struct sk_buff *ack = NULL;
1625 	ktime_t now = ktime_get();
1626 	struct cake_tin_data *b;
1627 	struct cake_flow *flow;
1628 	u32 idx;
1629 
1630 	/* choose flow to insert into */
1631 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1632 	if (idx == 0) {
1633 		if (ret & __NET_XMIT_BYPASS)
1634 			qdisc_qstats_drop(sch);
1635 		__qdisc_drop(skb, to_free);
1636 		return ret;
1637 	}
1638 	idx--;
1639 	flow = &b->flows[idx];
1640 
1641 	/* ensure shaper state isn't stale */
1642 	if (!b->tin_backlog) {
1643 		if (ktime_before(b->time_next_packet, now))
1644 			b->time_next_packet = now;
1645 
1646 		if (!sch->q.qlen) {
1647 			if (ktime_before(q->time_next_packet, now)) {
1648 				q->failsafe_next_packet = now;
1649 				q->time_next_packet = now;
1650 			} else if (ktime_after(q->time_next_packet, now) &&
1651 				   ktime_after(q->failsafe_next_packet, now)) {
1652 				u64 next = \
1653 					min(ktime_to_ns(q->time_next_packet),
1654 					    ktime_to_ns(
1655 						   q->failsafe_next_packet));
1656 				sch->qstats.overlimits++;
1657 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1658 			}
1659 		}
1660 	}
1661 
1662 	if (unlikely(len > b->max_skblen))
1663 		b->max_skblen = len;
1664 
1665 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1666 		struct sk_buff *segs, *nskb;
1667 		netdev_features_t features = netif_skb_features(skb);
1668 		unsigned int slen = 0, numsegs = 0;
1669 
1670 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1671 		if (IS_ERR_OR_NULL(segs))
1672 			return qdisc_drop(skb, sch, to_free);
1673 
1674 		while (segs) {
1675 			nskb = segs->next;
1676 			skb_mark_not_on_list(segs);
1677 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1678 			cobalt_set_enqueue_time(segs, now);
1679 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1680 									  segs);
1681 			flow_queue_add(flow, segs);
1682 
1683 			sch->q.qlen++;
1684 			numsegs++;
1685 			slen += segs->len;
1686 			q->buffer_used += segs->truesize;
1687 			b->packets++;
1688 			segs = nskb;
1689 		}
1690 
1691 		/* stats */
1692 		b->bytes	    += slen;
1693 		b->backlogs[idx]    += slen;
1694 		b->tin_backlog      += slen;
1695 		sch->qstats.backlog += slen;
1696 		q->avg_window_bytes += slen;
1697 
1698 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1699 		consume_skb(skb);
1700 	} else {
1701 		/* not splitting */
1702 		cobalt_set_enqueue_time(skb, now);
1703 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1704 		flow_queue_add(flow, skb);
1705 
1706 		if (q->ack_filter)
1707 			ack = cake_ack_filter(q, flow);
1708 
1709 		if (ack) {
1710 			b->ack_drops++;
1711 			sch->qstats.drops++;
1712 			b->bytes += qdisc_pkt_len(ack);
1713 			len -= qdisc_pkt_len(ack);
1714 			q->buffer_used += skb->truesize - ack->truesize;
1715 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1716 				cake_advance_shaper(q, b, ack, now, true);
1717 
1718 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1719 			consume_skb(ack);
1720 		} else {
1721 			sch->q.qlen++;
1722 			q->buffer_used      += skb->truesize;
1723 		}
1724 
1725 		/* stats */
1726 		b->packets++;
1727 		b->bytes	    += len;
1728 		b->backlogs[idx]    += len;
1729 		b->tin_backlog      += len;
1730 		sch->qstats.backlog += len;
1731 		q->avg_window_bytes += len;
1732 	}
1733 
1734 	if (q->overflow_timeout)
1735 		cake_heapify_up(q, b->overflow_idx[idx]);
1736 
1737 	/* incoming bandwidth capacity estimate */
1738 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1739 		u64 packet_interval = \
1740 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1741 
1742 		if (packet_interval > NSEC_PER_SEC)
1743 			packet_interval = NSEC_PER_SEC;
1744 
1745 		/* filter out short-term bursts, eg. wifi aggregation */
1746 		q->avg_packet_interval = \
1747 			cake_ewma(q->avg_packet_interval,
1748 				  packet_interval,
1749 				  (packet_interval > q->avg_packet_interval ?
1750 					  2 : 8));
1751 
1752 		q->last_packet_time = now;
1753 
1754 		if (packet_interval > q->avg_packet_interval) {
1755 			u64 window_interval = \
1756 				ktime_to_ns(ktime_sub(now,
1757 						      q->avg_window_begin));
1758 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1759 
1760 			do_div(b, window_interval);
1761 			q->avg_peak_bandwidth =
1762 				cake_ewma(q->avg_peak_bandwidth, b,
1763 					  b > q->avg_peak_bandwidth ? 2 : 8);
1764 			q->avg_window_bytes = 0;
1765 			q->avg_window_begin = now;
1766 
1767 			if (ktime_after(now,
1768 					ktime_add_ms(q->last_reconfig_time,
1769 						     250))) {
1770 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1771 				cake_reconfigure(sch);
1772 			}
1773 		}
1774 	} else {
1775 		q->avg_window_bytes = 0;
1776 		q->last_packet_time = now;
1777 	}
1778 
1779 	/* flowchain */
1780 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1781 		struct cake_host *srchost = &b->hosts[flow->srchost];
1782 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1783 		u16 host_load = 1;
1784 
1785 		if (!flow->set) {
1786 			list_add_tail(&flow->flowchain, &b->new_flows);
1787 		} else {
1788 			b->decaying_flow_count--;
1789 			list_move_tail(&flow->flowchain, &b->new_flows);
1790 		}
1791 		flow->set = CAKE_SET_SPARSE;
1792 		b->sparse_flow_count++;
1793 
1794 		if (cake_dsrc(q->flow_mode))
1795 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1796 
1797 		if (cake_ddst(q->flow_mode))
1798 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1799 
1800 		flow->deficit = (b->flow_quantum *
1801 				 quantum_div[host_load]) >> 16;
1802 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1803 		struct cake_host *srchost = &b->hosts[flow->srchost];
1804 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1805 
1806 		/* this flow was empty, accounted as a sparse flow, but actually
1807 		 * in the bulk rotation.
1808 		 */
1809 		flow->set = CAKE_SET_BULK;
1810 		b->sparse_flow_count--;
1811 		b->bulk_flow_count++;
1812 
1813 		if (cake_dsrc(q->flow_mode))
1814 			srchost->srchost_bulk_flow_count++;
1815 
1816 		if (cake_ddst(q->flow_mode))
1817 			dsthost->dsthost_bulk_flow_count++;
1818 
1819 	}
1820 
1821 	if (q->buffer_used > q->buffer_max_used)
1822 		q->buffer_max_used = q->buffer_used;
1823 
1824 	if (q->buffer_used > q->buffer_limit) {
1825 		u32 dropped = 0;
1826 
1827 		while (q->buffer_used > q->buffer_limit) {
1828 			dropped++;
1829 			cake_drop(sch, to_free);
1830 		}
1831 		b->drop_overlimit += dropped;
1832 	}
1833 	return NET_XMIT_SUCCESS;
1834 }
1835 
1836 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1837 {
1838 	struct cake_sched_data *q = qdisc_priv(sch);
1839 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1840 	struct cake_flow *flow = &b->flows[q->cur_flow];
1841 	struct sk_buff *skb = NULL;
1842 	u32 len;
1843 
1844 	if (flow->head) {
1845 		skb = dequeue_head(flow);
1846 		len = qdisc_pkt_len(skb);
1847 		b->backlogs[q->cur_flow] -= len;
1848 		b->tin_backlog		 -= len;
1849 		sch->qstats.backlog      -= len;
1850 		q->buffer_used		 -= skb->truesize;
1851 		sch->q.qlen--;
1852 
1853 		if (q->overflow_timeout)
1854 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1855 	}
1856 	return skb;
1857 }
1858 
1859 /* Discard leftover packets from a tin no longer in use. */
1860 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1861 {
1862 	struct cake_sched_data *q = qdisc_priv(sch);
1863 	struct sk_buff *skb;
1864 
1865 	q->cur_tin = tin;
1866 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1867 		while (!!(skb = cake_dequeue_one(sch)))
1868 			kfree_skb(skb);
1869 }
1870 
1871 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1872 {
1873 	struct cake_sched_data *q = qdisc_priv(sch);
1874 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1875 	struct cake_host *srchost, *dsthost;
1876 	ktime_t now = ktime_get();
1877 	struct cake_flow *flow;
1878 	struct list_head *head;
1879 	bool first_flow = true;
1880 	struct sk_buff *skb;
1881 	u16 host_load;
1882 	u64 delay;
1883 	u32 len;
1884 
1885 begin:
1886 	if (!sch->q.qlen)
1887 		return NULL;
1888 
1889 	/* global hard shaper */
1890 	if (ktime_after(q->time_next_packet, now) &&
1891 	    ktime_after(q->failsafe_next_packet, now)) {
1892 		u64 next = min(ktime_to_ns(q->time_next_packet),
1893 			       ktime_to_ns(q->failsafe_next_packet));
1894 
1895 		sch->qstats.overlimits++;
1896 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1897 		return NULL;
1898 	}
1899 
1900 	/* Choose a class to work on. */
1901 	if (!q->rate_ns) {
1902 		/* In unlimited mode we can't rely on shaper timings, so just
1903 		 * balance with DRR.
1904 		 */
1905 		bool wrapped = false, empty = true;
1906 
1907 		while (b->tin_deficit < 0 ||
1908 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1909 			if (b->tin_deficit <= 0)
1910 				b->tin_deficit += b->tin_quantum_band;
1911 			if (b->sparse_flow_count + b->bulk_flow_count)
1912 				empty = false;
1913 
1914 			q->cur_tin++;
1915 			b++;
1916 			if (q->cur_tin >= q->tin_cnt) {
1917 				q->cur_tin = 0;
1918 				b = q->tins;
1919 
1920 				if (wrapped) {
1921 					/* It's possible for sch->q.qlen to be
1922 					 * nonzero when we actually have no
1923 					 * packets anywhere.
1924 					 */
1925 					if (empty)
1926 						return NULL;
1927 				} else {
1928 					wrapped = true;
1929 				}
1930 			}
1931 		}
1932 	} else {
1933 		/* In shaped mode, choose:
1934 		 * - Highest-priority tin with queue and meeting schedule, or
1935 		 * - The earliest-scheduled tin with queue.
1936 		 */
1937 		ktime_t best_time = KTIME_MAX;
1938 		int tin, best_tin = 0;
1939 
1940 		for (tin = 0; tin < q->tin_cnt; tin++) {
1941 			b = q->tins + tin;
1942 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1943 				ktime_t time_to_pkt =
1944 					ktime_sub(b->time_next_packet, now);
1945 
1946 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1947 				    ktime_compare(time_to_pkt,
1948 						  best_time) <= 0) {
1949 					best_time = time_to_pkt;
1950 					best_tin = tin;
1951 				}
1952 			}
1953 		}
1954 
1955 		q->cur_tin = best_tin;
1956 		b = q->tins + best_tin;
1957 
1958 		/* No point in going further if no packets to deliver. */
1959 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1960 			return NULL;
1961 	}
1962 
1963 retry:
1964 	/* service this class */
1965 	head = &b->decaying_flows;
1966 	if (!first_flow || list_empty(head)) {
1967 		head = &b->new_flows;
1968 		if (list_empty(head)) {
1969 			head = &b->old_flows;
1970 			if (unlikely(list_empty(head))) {
1971 				head = &b->decaying_flows;
1972 				if (unlikely(list_empty(head)))
1973 					goto begin;
1974 			}
1975 		}
1976 	}
1977 	flow = list_first_entry(head, struct cake_flow, flowchain);
1978 	q->cur_flow = flow - b->flows;
1979 	first_flow = false;
1980 
1981 	/* triple isolation (modified DRR++) */
1982 	srchost = &b->hosts[flow->srchost];
1983 	dsthost = &b->hosts[flow->dsthost];
1984 	host_load = 1;
1985 
1986 	/* flow isolation (DRR++) */
1987 	if (flow->deficit <= 0) {
1988 		/* Keep all flows with deficits out of the sparse and decaying
1989 		 * rotations.  No non-empty flow can go into the decaying
1990 		 * rotation, so they can't get deficits.
1991 		 */
1992 		if (flow->set == CAKE_SET_SPARSE) {
1993 			if (flow->head) {
1994 				b->sparse_flow_count--;
1995 				b->bulk_flow_count++;
1996 
1997 				if (cake_dsrc(q->flow_mode))
1998 					srchost->srchost_bulk_flow_count++;
1999 
2000 				if (cake_ddst(q->flow_mode))
2001 					dsthost->dsthost_bulk_flow_count++;
2002 
2003 				flow->set = CAKE_SET_BULK;
2004 			} else {
2005 				/* we've moved it to the bulk rotation for
2006 				 * correct deficit accounting but we still want
2007 				 * to count it as a sparse flow, not a bulk one.
2008 				 */
2009 				flow->set = CAKE_SET_SPARSE_WAIT;
2010 			}
2011 		}
2012 
2013 		if (cake_dsrc(q->flow_mode))
2014 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2015 
2016 		if (cake_ddst(q->flow_mode))
2017 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2018 
2019 		WARN_ON(host_load > CAKE_QUEUES);
2020 
2021 		/* The shifted prandom_u32() is a way to apply dithering to
2022 		 * avoid accumulating roundoff errors.
2023 		 */
2024 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2025 				  (prandom_u32() >> 16)) >> 16;
2026 		list_move_tail(&flow->flowchain, &b->old_flows);
2027 
2028 		goto retry;
2029 	}
2030 
2031 	/* Retrieve a packet via the AQM */
2032 	while (1) {
2033 		skb = cake_dequeue_one(sch);
2034 		if (!skb) {
2035 			/* this queue was actually empty */
2036 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2037 				b->unresponsive_flow_count--;
2038 
2039 			if (flow->cvars.p_drop || flow->cvars.count ||
2040 			    ktime_before(now, flow->cvars.drop_next)) {
2041 				/* keep in the flowchain until the state has
2042 				 * decayed to rest
2043 				 */
2044 				list_move_tail(&flow->flowchain,
2045 					       &b->decaying_flows);
2046 				if (flow->set == CAKE_SET_BULK) {
2047 					b->bulk_flow_count--;
2048 
2049 					if (cake_dsrc(q->flow_mode))
2050 						srchost->srchost_bulk_flow_count--;
2051 
2052 					if (cake_ddst(q->flow_mode))
2053 						dsthost->dsthost_bulk_flow_count--;
2054 
2055 					b->decaying_flow_count++;
2056 				} else if (flow->set == CAKE_SET_SPARSE ||
2057 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2058 					b->sparse_flow_count--;
2059 					b->decaying_flow_count++;
2060 				}
2061 				flow->set = CAKE_SET_DECAYING;
2062 			} else {
2063 				/* remove empty queue from the flowchain */
2064 				list_del_init(&flow->flowchain);
2065 				if (flow->set == CAKE_SET_SPARSE ||
2066 				    flow->set == CAKE_SET_SPARSE_WAIT)
2067 					b->sparse_flow_count--;
2068 				else if (flow->set == CAKE_SET_BULK) {
2069 					b->bulk_flow_count--;
2070 
2071 					if (cake_dsrc(q->flow_mode))
2072 						srchost->srchost_bulk_flow_count--;
2073 
2074 					if (cake_ddst(q->flow_mode))
2075 						dsthost->dsthost_bulk_flow_count--;
2076 
2077 				} else
2078 					b->decaying_flow_count--;
2079 
2080 				flow->set = CAKE_SET_NONE;
2081 			}
2082 			goto begin;
2083 		}
2084 
2085 		/* Last packet in queue may be marked, shouldn't be dropped */
2086 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2087 					(b->bulk_flow_count *
2088 					 !!(q->rate_flags &
2089 					    CAKE_FLAG_INGRESS))) ||
2090 		    !flow->head)
2091 			break;
2092 
2093 		/* drop this packet, get another one */
2094 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2095 			len = cake_advance_shaper(q, b, skb,
2096 						  now, true);
2097 			flow->deficit -= len;
2098 			b->tin_deficit -= len;
2099 		}
2100 		flow->dropped++;
2101 		b->tin_dropped++;
2102 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2103 		qdisc_qstats_drop(sch);
2104 		kfree_skb(skb);
2105 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2106 			goto retry;
2107 	}
2108 
2109 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2110 	qdisc_bstats_update(sch, skb);
2111 
2112 	/* collect delay stats */
2113 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2114 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2115 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2116 				  delay > b->peak_delay ? 2 : 8);
2117 	b->base_delay = cake_ewma(b->base_delay, delay,
2118 				  delay < b->base_delay ? 2 : 8);
2119 
2120 	len = cake_advance_shaper(q, b, skb, now, false);
2121 	flow->deficit -= len;
2122 	b->tin_deficit -= len;
2123 
2124 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2125 		u64 next = min(ktime_to_ns(q->time_next_packet),
2126 			       ktime_to_ns(q->failsafe_next_packet));
2127 
2128 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2129 	} else if (!sch->q.qlen) {
2130 		int i;
2131 
2132 		for (i = 0; i < q->tin_cnt; i++) {
2133 			if (q->tins[i].decaying_flow_count) {
2134 				ktime_t next =
2135 					ktime_add_ns(now,
2136 						     q->tins[i].cparams.target);
2137 
2138 				qdisc_watchdog_schedule_ns(&q->watchdog,
2139 							   ktime_to_ns(next));
2140 				break;
2141 			}
2142 		}
2143 	}
2144 
2145 	if (q->overflow_timeout)
2146 		q->overflow_timeout--;
2147 
2148 	return skb;
2149 }
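
/* Editor's illustration (hypothetical helper, not part of the qdisc):
 * the deficit refill above scales the flow quantum by ~1/host_load via
 * the quantum_div[] table (quantum_div[h] == 65535 / h, see cake_init())
 * and adds a 16-bit random dither before the final shift so truncation
 * errors average out instead of accumulating:
 */
static inline u32 cake_refill_sketch(u32 quantum, u16 host_load, u32 rnd16)
{
	/* rnd16 is uniform in [0, 65535], e.g. prandom_u32() >> 16 */
	return (quantum * (65535U / host_load) + rnd16) >> 16;
}

/* e.g. quantum = 1514 and host_load = 3: 1514 * 21845 = 33073330,
 * which shifts down to 504 or 505 bytes depending on the dither,
 * i.e. roughly 1514 / 3, so three bulk flows behind one host split
 * that host's fair share.
 */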
2150 
2151 static void cake_reset(struct Qdisc *sch)
2152 {
2153 	u32 c;
2154 
2155 	for (c = 0; c < CAKE_MAX_TINS; c++)
2156 		cake_clear_tin(sch, c);
2157 }
2158 
2159 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2160 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2161 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2162 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2163 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2164 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2165 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2166 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2167 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2168 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2169 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2170 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2171 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2172 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2173 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2174 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2175 };
2176 
2177 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2178 			  u64 target_ns, u64 rtt_est_ns)
2179 {
2180 	/* convert byte-rate into time-per-byte
2181 	 * so the shaper will always unwedge in a reasonable time.
2182 	 */
2183 	static const u64 MIN_RATE = 64;
2184 	u32 byte_target = mtu;
2185 	u64 byte_target_ns;
2186 	u8  rate_shft = 0;
2187 	u64 rate_ns = 0;
2188 
2189 	b->flow_quantum = 1514;
2190 	if (rate) {
2191 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2192 		rate_shft = 34;
2193 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2194 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2195 		while (!!(rate_ns >> 34)) {
2196 			rate_ns >>= 1;
2197 			rate_shft--;
2198 		}
2199 	} /* else unlimited, ie. zero delay */
2200 
2201 	b->tin_rate_bps  = rate;
2202 	b->tin_rate_ns   = rate_ns;
2203 	b->tin_rate_shft = rate_shft;
2204 
2205 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2206 
2207 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2208 	b->cparams.interval = max(rtt_est_ns +
2209 				  b->cparams.target - target_ns,
2210 				  b->cparams.target * 2);
2211 	b->cparams.mtu_time = byte_target_ns;
2212 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2213 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2214 }
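
/* Editor's illustration (a sketch under the invariants established
 * above): cake_set_rate() stores time-per-byte as the fixed-point pair
 * (tin_rate_ns, tin_rate_shft) with tin_rate_ns kept below 2^34, so a
 * packet's serialisation time needs no division on the fast path:
 */
static inline u64 cake_pkt_time_sketch(u32 len, u64 rate_ns, u8 rate_shft)
{
	return ((u64)len * rate_ns) >> rate_shft;
}

/* e.g. 10 Mbit/s is 1.25 MB/s or 800 ns/byte; the normalising loop
 * above ends with rate_ns = 800 << 24 and rate_shft = 24, and a
 * 1514-byte packet then costs (1514 * rate_ns) >> 24 = 1211200 ns,
 * about 1.2 ms on the wire.
 */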
2215 
2216 static int cake_config_besteffort(struct Qdisc *sch)
2217 {
2218 	struct cake_sched_data *q = qdisc_priv(sch);
2219 	struct cake_tin_data *b = &q->tins[0];
2220 	u32 mtu = psched_mtu(qdisc_dev(sch));
2221 	u64 rate = q->rate_bps;
2222 
2223 	q->tin_cnt = 1;
2224 
2225 	q->tin_index = besteffort;
2226 	q->tin_order = normal_order;
2227 
2228 	cake_set_rate(b, rate, mtu,
2229 		      us_to_ns(q->target), us_to_ns(q->interval));
2230 	b->tin_quantum_band = 65535;
2231 	b->tin_quantum_prio = 65535;
2232 
2233 	return 0;
2234 }
2235 
2236 static int cake_config_precedence(struct Qdisc *sch)
2237 {
2238 	/* convert high-level (user visible) parameters into internal format */
2239 	struct cake_sched_data *q = qdisc_priv(sch);
2240 	u32 mtu = psched_mtu(qdisc_dev(sch));
2241 	u64 rate = q->rate_bps;
2242 	u32 quantum1 = 256;
2243 	u32 quantum2 = 256;
2244 	u32 i;
2245 
2246 	q->tin_cnt = 8;
2247 	q->tin_index = precedence;
2248 	q->tin_order = normal_order;
2249 
2250 	for (i = 0; i < q->tin_cnt; i++) {
2251 		struct cake_tin_data *b = &q->tins[i];
2252 
2253 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2254 			      us_to_ns(q->interval));
2255 
2256 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2257 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2258 
2259 		/* calculate next class's parameters */
2260 		rate  *= 7;
2261 		rate >>= 3;
2262 
2263 		quantum1  *= 3;
2264 		quantum1 >>= 1;
2265 
2266 		quantum2  *= 7;
2267 		quantum2 >>= 3;
2268 	}
2269 
2270 	return 0;
2271 }
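
/* Editor's worked example: each successive precedence tin is configured
 * with 7/8 of the previous tin's shaped rate, a priority quantum grown
 * by 3/2 and a bandwidth quantum shrunk by 7/8; for tins 0..3 that is:
 *
 *	rate:     R, 7R/8, 49R/64, 343R/512
 *	quantum1: 256, 384, 576, 864   (priority weight, rising)
 *	quantum2: 256, 224, 196, 171   (bandwidth weight, falling)
 */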
2272 
2273 /*	List of known Diffserv codepoints:
2274  *
2275  *	Least Effort (CS1)
2276  *	Best Effort (CS0)
2277  *	Max Reliability & LLT "Lo" (TOS1)
2278  *	Max Throughput (TOS2)
2279  *	Min Delay (TOS4)
2280  *	LLT "La" (TOS5)
2281  *	Assured Forwarding 1 (AF1x) - x3
2282  *	Assured Forwarding 2 (AF2x) - x3
2283  *	Assured Forwarding 3 (AF3x) - x3
2284  *	Assured Forwarding 4 (AF4x) - x3
2285  *	Precedence Class 2 (CS2)
2286  *	Precedence Class 3 (CS3)
2287  *	Precedence Class 4 (CS4)
2288  *	Precedence Class 5 (CS5)
2289  *	Precedence Class 6 (CS6)
2290  *	Precedence Class 7 (CS7)
2291  *	Voice Admit (VA)
2292  *	Expedited Forwarding (EF)
2293  *
2294  *	Total 25 codepoints.
2295  */
2296 
2297 /*	List of traffic classes in RFC 4594:
2298  *		(roughly descending order of contended priority)
2299  *		(roughly ascending order of uncontended throughput)
2300  *
2301  *	Network Control (CS6,CS7)         - routing traffic
2302  *	Telephony (EF,VA)                 - aka. VoIP streams
2303  *	Signalling (CS5)                  - VoIP setup
2304  *	Multimedia Conferencing (AF4x)    - aka. video calls
2305  *	Realtime Interactive (CS4)        - eg. games
2306  *	Multimedia Streaming (AF3x)       - eg. YouTube, NetFlix, Twitch
2307  *	Broadcast Video (CS3)
2308  *	Low Latency Data (AF2x,TOS4)      - eg. database
2309  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2310  *	Standard Service (CS0 & unrecognised codepoints)
2311  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2312  *	Low Priority Data (CS1)           - eg. BitTorrent
2313  *
2314  *	Total 12 traffic classes.
2315  */
2316 
2317 static int cake_config_diffserv8(struct Qdisc *sch)
2318 {
2319 /*	Pruned list of traffic classes for typical applications:
2320  *
2321  *		Network Control          (CS6, CS7)
2322  *		Minimum Latency          (EF, VA, CS5, CS4)
2323  *		Interactive Shell        (CS2, TOS1)
2324  *		Low Latency Transactions (AF2x, TOS4)
2325  *		Video Streaming          (AF4x, AF3x, CS3)
2326  *		Bog Standard             (CS0 etc.)
2327  *		High Throughput          (AF1x, TOS2)
2328  *		Background Traffic       (CS1)
2329  *
2330  *		Total 8 traffic classes.
2331  */
2332 
2333 	struct cake_sched_data *q = qdisc_priv(sch);
2334 	u32 mtu = psched_mtu(qdisc_dev(sch));
2335 	u64 rate = q->rate_bps;
2336 	u32 quantum1 = 256;
2337 	u32 quantum2 = 256;
2338 	u32 i;
2339 
2340 	q->tin_cnt = 8;
2341 
2342 	/* codepoint to class mapping */
2343 	q->tin_index = diffserv8;
2344 	q->tin_order = normal_order;
2345 
2346 	/* class characteristics */
2347 	for (i = 0; i < q->tin_cnt; i++) {
2348 		struct cake_tin_data *b = &q->tins[i];
2349 
2350 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2351 			      us_to_ns(q->interval));
2352 
2353 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2354 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2355 
2356 		/* calculate next class's parameters */
2357 		rate  *= 7;
2358 		rate >>= 3;
2359 
2360 		quantum1  *= 3;
2361 		quantum1 >>= 1;
2362 
2363 		quantum2  *= 7;
2364 		quantum2 >>= 3;
2365 	}
2366 
2367 	return 0;
2368 }
2369 
2370 static int cake_config_diffserv4(struct Qdisc *sch)
2371 {
2372 /*  Further pruned list of traffic classes for four-class system:
2373  *
2374  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2375  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2376  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2377  *	    Background Traffic (CS1)
2378  *
2379  *		Total 4 traffic classes.
2380  */
2381 
2382 	struct cake_sched_data *q = qdisc_priv(sch);
2383 	u32 mtu = psched_mtu(qdisc_dev(sch));
2384 	u64 rate = q->rate_bps;
2385 	u32 quantum = 1024;
2386 
2387 	q->tin_cnt = 4;
2388 
2389 	/* codepoint to class mapping */
2390 	q->tin_index = diffserv4;
2391 	q->tin_order = bulk_order;
2392 
2393 	/* class characteristics */
2394 	cake_set_rate(&q->tins[0], rate, mtu,
2395 		      us_to_ns(q->target), us_to_ns(q->interval));
2396 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2397 		      us_to_ns(q->target), us_to_ns(q->interval));
2398 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2399 		      us_to_ns(q->target), us_to_ns(q->interval));
2400 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2401 		      us_to_ns(q->target), us_to_ns(q->interval));
2402 
2403 	/* priority weights */
2404 	q->tins[0].tin_quantum_prio = quantum;
2405 	q->tins[1].tin_quantum_prio = quantum >> 4;
2406 	q->tins[2].tin_quantum_prio = quantum << 2;
2407 	q->tins[3].tin_quantum_prio = quantum << 4;
2408 
2409 	/* bandwidth-sharing weights */
2410 	q->tins[0].tin_quantum_band = quantum;
2411 	q->tins[1].tin_quantum_band = quantum >> 4;
2412 	q->tins[2].tin_quantum_band = quantum >> 1;
2413 	q->tins[3].tin_quantum_band = quantum >> 2;
2414 
2415 	return 0;
2416 }
2417 
2418 static int cake_config_diffserv3(struct Qdisc *sch)
2419 {
2420 /*  Simplified Diffserv structure with 3 tins.
2421  *		Low Priority		(CS1)
2422  *		Best Effort
2423  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2424  */
2425 	struct cake_sched_data *q = qdisc_priv(sch);
2426 	u32 mtu = psched_mtu(qdisc_dev(sch));
2427 	u64 rate = q->rate_bps;
2428 	u32 quantum = 1024;
2429 
2430 	q->tin_cnt = 3;
2431 
2432 	/* codepoint to class mapping */
2433 	q->tin_index = diffserv3;
2434 	q->tin_order = bulk_order;
2435 
2436 	/* class characteristics */
2437 	cake_set_rate(&q->tins[0], rate, mtu,
2438 		      us_to_ns(q->target), us_to_ns(q->interval));
2439 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2440 		      us_to_ns(q->target), us_to_ns(q->interval));
2441 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2442 		      us_to_ns(q->target), us_to_ns(q->interval));
2443 
2444 	/* priority weights */
2445 	q->tins[0].tin_quantum_prio = quantum;
2446 	q->tins[1].tin_quantum_prio = quantum >> 4;
2447 	q->tins[2].tin_quantum_prio = quantum << 4;
2448 
2449 	/* bandwidth-sharing weights */
2450 	q->tins[0].tin_quantum_band = quantum;
2451 	q->tins[1].tin_quantum_band = quantum >> 4;
2452 	q->tins[2].tin_quantum_band = quantum >> 2;
2453 
2454 	return 0;
2455 }
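
/* Editor's worked example: with quantum = 1024 the diffserv3 weights
 * above come out to bandwidth shares of 1024:64:256, i.e. 16:1:4 for
 * Best Effort : Low Priority : Latency Sensitive once every tin is past
 * its threshold, while the priority weights 1024:64:16384 let the
 * latency-sensitive tin preempt the others up to its rate >> 2
 * threshold.
 */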
2456 
2457 static void cake_reconfigure(struct Qdisc *sch)
2458 {
2459 	struct cake_sched_data *q = qdisc_priv(sch);
2460 	int c, ft;
2461 
2462 	switch (q->tin_mode) {
2463 	case CAKE_DIFFSERV_BESTEFFORT:
2464 		ft = cake_config_besteffort(sch);
2465 		break;
2466 
2467 	case CAKE_DIFFSERV_PRECEDENCE:
2468 		ft = cake_config_precedence(sch);
2469 		break;
2470 
2471 	case CAKE_DIFFSERV_DIFFSERV8:
2472 		ft = cake_config_diffserv8(sch);
2473 		break;
2474 
2475 	case CAKE_DIFFSERV_DIFFSERV4:
2476 		ft = cake_config_diffserv4(sch);
2477 		break;
2478 
2479 	case CAKE_DIFFSERV_DIFFSERV3:
2480 	default:
2481 		ft = cake_config_diffserv3(sch);
2482 		break;
2483 	}
2484 
2485 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2486 		cake_clear_tin(sch, c);
2487 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2488 	}
2489 
2490 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2491 	q->rate_shft = q->tins[ft].tin_rate_shft;
2492 
2493 	if (q->buffer_config_limit) {
2494 		q->buffer_limit = q->buffer_config_limit;
2495 	} else if (q->rate_bps) {
2496 		u64 t = q->rate_bps * q->interval;
2497 
2498 		do_div(t, USEC_PER_SEC / 4);
2499 		q->buffer_limit = max_t(u32, t, 4U << 20);
2500 	} else {
2501 		q->buffer_limit = ~0;
2502 	}
2503 
2504 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2505 
2506 	q->buffer_limit = min(q->buffer_limit,
2507 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2508 				  q->buffer_config_limit));
2509 }
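
/* Editor's worked example: with rate_bps in bytes per second (as tc
 * passes it) and interval in microseconds,
 * t = rate_bps * interval / (USEC_PER_SEC / 4) is four times the
 * bandwidth-delay product.  At 10 Mbit/s (1.25 MB/s) with the default
 * 100 ms interval that is 500 kB, so the 4 MiB floor from max_t()
 * dominates; the final min() then caps the buffer at sch->limit
 * full-size packets unless a larger memory limit was configured
 * explicitly.
 */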
2510 
2511 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2512 		       struct netlink_ext_ack *extack)
2513 {
2514 	struct cake_sched_data *q = qdisc_priv(sch);
2515 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2516 	int err;
2517 
2518 	if (!opt)
2519 		return -EINVAL;
2520 
2521 	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2522 	if (err < 0)
2523 		return err;
2524 
2525 	if (tb[TCA_CAKE_NAT]) {
2526 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2527 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2528 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2529 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2530 #else
2531 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2532 				    "No conntrack support in kernel");
2533 		return -EOPNOTSUPP;
2534 #endif
2535 	}
2536 
2537 	if (tb[TCA_CAKE_BASE_RATE64])
2538 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2539 
2540 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2541 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2542 
2543 	if (tb[TCA_CAKE_WASH]) {
2544 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2545 			q->rate_flags |= CAKE_FLAG_WASH;
2546 		else
2547 			q->rate_flags &= ~CAKE_FLAG_WASH;
2548 	}
2549 
2550 	if (tb[TCA_CAKE_FLOW_MODE])
2551 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2552 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2553 					CAKE_FLOW_MASK));
2554 
2555 	if (tb[TCA_CAKE_ATM])
2556 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2557 
2558 	if (tb[TCA_CAKE_OVERHEAD]) {
2559 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2560 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2561 
2562 		q->max_netlen = 0;
2563 		q->max_adjlen = 0;
2564 		q->min_netlen = ~0;
2565 		q->min_adjlen = ~0;
2566 	}
2567 
2568 	if (tb[TCA_CAKE_RAW]) {
2569 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2570 
2571 		q->max_netlen = 0;
2572 		q->max_adjlen = 0;
2573 		q->min_netlen = ~0;
2574 		q->min_adjlen = ~0;
2575 	}
2576 
2577 	if (tb[TCA_CAKE_MPU])
2578 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2579 
2580 	if (tb[TCA_CAKE_RTT]) {
2581 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2582 
2583 		if (!q->interval)
2584 			q->interval = 1;
2585 	}
2586 
2587 	if (tb[TCA_CAKE_TARGET]) {
2588 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2589 
2590 		if (!q->target)
2591 			q->target = 1;
2592 	}
2593 
2594 	if (tb[TCA_CAKE_AUTORATE]) {
2595 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2596 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2597 		else
2598 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2599 	}
2600 
2601 	if (tb[TCA_CAKE_INGRESS]) {
2602 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2603 			q->rate_flags |= CAKE_FLAG_INGRESS;
2604 		else
2605 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2606 	}
2607 
2608 	if (tb[TCA_CAKE_ACK_FILTER])
2609 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2610 
2611 	if (tb[TCA_CAKE_MEMORY])
2612 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2613 
2614 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2615 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2616 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2617 		else
2618 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2619 	}
2620 
2621 	if (tb[TCA_CAKE_FWMARK]) {
2622 		if (!!nla_get_u32(tb[TCA_CAKE_FWMARK]))
2623 			q->rate_flags |= CAKE_FLAG_FWMARK;
2624 		else
2625 			q->rate_flags &= ~CAKE_FLAG_FWMARK;
2626 	}
2627 
2628 	if (q->tins) {
2629 		sch_tree_lock(sch);
2630 		cake_reconfigure(sch);
2631 		sch_tree_unlock(sch);
2632 	}
2633 
2634 	return 0;
2635 }
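
/* Editor's usage note: these attributes normally arrive from userspace
 * via tc(8); a typical invocation (tc-cake keywords as documented in
 * tc-cake(8), not verified against every iproute2 version) might be:
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 20Mbit \
 *		diffserv4 nat ack-filter memlimit 8mb
 *
 * which lands here as TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE,
 * TCA_CAKE_NAT, TCA_CAKE_ACK_FILTER and TCA_CAKE_MEMORY respectively.
 */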
2636 
2637 static void cake_destroy(struct Qdisc *sch)
2638 {
2639 	struct cake_sched_data *q = qdisc_priv(sch);
2640 
2641 	qdisc_watchdog_cancel(&q->watchdog);
2642 	tcf_block_put(q->block);
2643 	kvfree(q->tins);
2644 }
2645 
2646 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2647 		     struct netlink_ext_ack *extack)
2648 {
2649 	struct cake_sched_data *q = qdisc_priv(sch);
2650 	int i, j, err;
2651 
2652 	sch->limit = 10240;
2653 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2654 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2655 
2656 	q->rate_bps = 0; /* unlimited by default */
2657 
2658 	q->interval = 100000; /* 100ms default */
2659 	q->target   =   5000; /* 5ms: codel RFC argues
2660 			       * for 5 to 10% of interval
2661 			       */
2662 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2663 	q->cur_tin = 0;
2664 	q->cur_flow  = 0;
2665 
2666 	qdisc_watchdog_init(&q->watchdog, sch);
2667 
2668 	if (opt) {
2669 		err = cake_change(sch, opt, extack);
2670 
2671 		if (err)
2672 			return err;
2673 	}
2674 
2675 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2676 	if (err)
2677 		return err;
2678 
2679 	quantum_div[0] = ~0;
2680 	for (i = 1; i <= CAKE_QUEUES; i++)
2681 		quantum_div[i] = 65535 / i;
2682 
2683 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2684 			   GFP_KERNEL);
2685 	if (!q->tins)
2686 		goto nomem;
2687 
2688 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2689 		struct cake_tin_data *b = q->tins + i;
2690 
2691 		INIT_LIST_HEAD(&b->new_flows);
2692 		INIT_LIST_HEAD(&b->old_flows);
2693 		INIT_LIST_HEAD(&b->decaying_flows);
2694 		b->sparse_flow_count = 0;
2695 		b->bulk_flow_count = 0;
2696 		b->decaying_flow_count = 0;
2697 
2698 		for (j = 0; j < CAKE_QUEUES; j++) {
2699 			struct cake_flow *flow = b->flows + j;
2700 			u32 k = j * CAKE_MAX_TINS + i;
2701 
2702 			INIT_LIST_HEAD(&flow->flowchain);
2703 			cobalt_vars_init(&flow->cvars);
2704 
2705 			q->overflow_heap[k].t = i;
2706 			q->overflow_heap[k].b = j;
2707 			b->overflow_idx[j] = k;
2708 		}
2709 	}
2710 
2711 	cake_reconfigure(sch);
2712 	q->avg_peak_bandwidth = q->rate_bps;
2713 	q->min_netlen = ~0;
2714 	q->min_adjlen = ~0;
2715 	return 0;
2716 
2717 nomem:
2718 	cake_destroy(sch);
2719 	return -ENOMEM;
2720 }
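
/* Editor's note: the overflow heap initialised above interleaves queues
 * across tins: heap slot k = j * CAKE_MAX_TINS + i refers to flow j of
 * tin i, and b->overflow_idx[j] remembers k so cake_heapify() can be
 * re-run for exactly that queue when its backlog changes.  E.g. with
 * the full 8 tins, flow 5 of tin 2 sits at slot 5 * 8 + 2 = 42.
 */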
2721 
2722 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2723 {
2724 	struct cake_sched_data *q = qdisc_priv(sch);
2725 	struct nlattr *opts;
2726 
2727 	opts = nla_nest_start(skb, TCA_OPTIONS);
2728 	if (!opts)
2729 		goto nla_put_failure;
2730 
2731 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2732 			      TCA_CAKE_PAD))
2733 		goto nla_put_failure;
2734 
2735 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2736 			q->flow_mode & CAKE_FLOW_MASK))
2737 		goto nla_put_failure;
2738 
2739 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2740 		goto nla_put_failure;
2741 
2742 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2743 		goto nla_put_failure;
2744 
2745 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2746 		goto nla_put_failure;
2747 
2748 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2749 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2750 		goto nla_put_failure;
2751 
2752 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2753 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2754 		goto nla_put_failure;
2755 
2756 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2757 		goto nla_put_failure;
2758 
2759 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2760 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2761 		goto nla_put_failure;
2762 
2763 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2764 		goto nla_put_failure;
2765 
2766 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2767 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2768 		goto nla_put_failure;
2769 
2770 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2771 		goto nla_put_failure;
2772 
2773 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2774 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2775 			goto nla_put_failure;
2776 
2777 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2778 		goto nla_put_failure;
2779 
2780 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2781 		goto nla_put_failure;
2782 
2783 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2784 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2785 		goto nla_put_failure;
2786 
2787 	if (nla_put_u32(skb, TCA_CAKE_FWMARK,
2788 			!!(q->rate_flags & CAKE_FLAG_FWMARK)))
2789 		goto nla_put_failure;
2790 
2791 	return nla_nest_end(skb, opts);
2792 
2793 nla_put_failure:
2794 	return -1;
2795 }
2796 
2797 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2798 {
2799 	struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2800 	struct cake_sched_data *q = qdisc_priv(sch);
2801 	struct nlattr *tstats, *ts;
2802 	int i;
2803 
2804 	if (!stats)
2805 		return -1;
2806 
2807 #define PUT_STAT_U32(attr, data) do {				       \
2808 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2809 			goto nla_put_failure;			       \
2810 	} while (0)
2811 #define PUT_STAT_U64(attr, data) do {				       \
2812 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2813 					data, TCA_CAKE_STATS_PAD)) \
2814 			goto nla_put_failure;			       \
2815 	} while (0)
2816 
2817 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2818 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2819 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2820 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2821 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2822 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2823 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2824 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2825 
2826 #undef PUT_STAT_U32
2827 #undef PUT_STAT_U64
2828 
2829 	tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2830 	if (!tstats)
2831 		goto nla_put_failure;
2832 
2833 #define PUT_TSTAT_U32(attr, data) do {					\
2834 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2835 			goto nla_put_failure;				\
2836 	} while (0)
2837 #define PUT_TSTAT_U64(attr, data) do {					\
2838 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2839 					data, TCA_CAKE_TIN_STATS_PAD))	\
2840 			goto nla_put_failure;				\
2841 	} while (0)
2842 
2843 	for (i = 0; i < q->tin_cnt; i++) {
2844 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2845 
2846 		ts = nla_nest_start(d->skb, i + 1);
2847 		if (!ts)
2848 			goto nla_put_failure;
2849 
2850 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2851 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2852 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2853 
2854 		PUT_TSTAT_U32(TARGET_US,
2855 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2856 		PUT_TSTAT_U32(INTERVAL_US,
2857 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2858 
2859 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2860 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2861 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2862 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2863 
2864 		PUT_TSTAT_U32(PEAK_DELAY_US,
2865 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2866 		PUT_TSTAT_U32(AVG_DELAY_US,
2867 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2868 		PUT_TSTAT_U32(BASE_DELAY_US,
2869 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2870 
2871 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2872 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2873 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2874 
2875 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2876 					    b->decaying_flow_count);
2877 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2878 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2879 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2880 
2881 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2882 		nla_nest_end(d->skb, ts);
2883 	}
2884 
2885 #undef PUT_TSTAT_U32
2886 #undef PUT_TSTAT_U64
2887 
2888 	nla_nest_end(d->skb, tstats);
2889 	return nla_nest_end(d->skb, stats);
2890 
2891 nla_put_failure:
2892 	nla_nest_cancel(d->skb, stats);
2893 	return -1;
2894 }
2895 
2896 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2897 {
2898 	return NULL;
2899 }
2900 
2901 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2902 {
2903 	return 0;
2904 }
2905 
2906 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2907 			       u32 classid)
2908 {
2909 	return 0;
2910 }
2911 
2912 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2913 {
2914 }
2915 
2916 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2917 					struct netlink_ext_ack *extack)
2918 {
2919 	struct cake_sched_data *q = qdisc_priv(sch);
2920 
2921 	if (cl)
2922 		return NULL;
2923 	return q->block;
2924 }
2925 
2926 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2927 			   struct sk_buff *skb, struct tcmsg *tcm)
2928 {
2929 	tcm->tcm_handle |= TC_H_MIN(cl);
2930 	return 0;
2931 }
2932 
2933 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2934 				 struct gnet_dump *d)
2935 {
2936 	struct cake_sched_data *q = qdisc_priv(sch);
2937 	const struct cake_flow *flow = NULL;
2938 	struct gnet_stats_queue qs = { 0 };
2939 	struct nlattr *stats;
2940 	u32 idx = cl - 1;
2941 
2942 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2943 		const struct cake_tin_data *b =
2944 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2945 		const struct sk_buff *skb;
2946 
2947 		flow = &b->flows[idx % CAKE_QUEUES];
2948 
2949 		if (flow->head) {
2950 			sch_tree_lock(sch);
2951 			skb = flow->head;
2952 			while (skb) {
2953 				qs.qlen++;
2954 				skb = skb->next;
2955 			}
2956 			sch_tree_unlock(sch);
2957 		}
2958 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2959 		qs.drops = flow->dropped;
2960 	}
2961 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2962 		return -1;
2963 	if (flow) {
2964 		ktime_t now = ktime_get();
2965 
2966 		stats = nla_nest_start(d->skb, TCA_STATS_APP);
2967 		if (!stats)
2968 			return -1;
2969 
2970 #define PUT_STAT_U32(attr, data) do {				       \
2971 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2972 			goto nla_put_failure;			       \
2973 	} while (0)
2974 #define PUT_STAT_S32(attr, data) do {				       \
2975 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2976 			goto nla_put_failure;			       \
2977 	} while (0)
2978 
2979 		PUT_STAT_S32(DEFICIT, flow->deficit);
2980 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2981 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2982 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2983 		if (flow->cvars.p_drop) {
2984 			PUT_STAT_S32(BLUE_TIMER_US,
2985 				     ktime_to_us(
2986 					     ktime_sub(now,
2987 						     flow->cvars.blue_timer)));
2988 		}
2989 		if (flow->cvars.dropping) {
2990 			PUT_STAT_S32(DROP_NEXT_US,
2991 				     ktime_to_us(
2992 					     ktime_sub(now,
2993 						       flow->cvars.drop_next)));
2994 		}
2995 
2996 		if (nla_nest_end(d->skb, stats) < 0)
2997 			return -1;
2998 	}
2999 
3000 	return 0;
3001 
3002 nla_put_failure:
3003 	nla_nest_cancel(d->skb, stats);
3004 	return -1;
3005 }
3006 
3007 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3008 {
3009 	struct cake_sched_data *q = qdisc_priv(sch);
3010 	unsigned int i, j;
3011 
3012 	if (arg->stop)
3013 		return;
3014 
3015 	for (i = 0; i < q->tin_cnt; i++) {
3016 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3017 
3018 		for (j = 0; j < CAKE_QUEUES; j++) {
3019 			if (list_empty(&b->flows[j].flowchain) ||
3020 			    arg->count < arg->skip) {
3021 				arg->count++;
3022 				continue;
3023 			}
3024 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3025 				arg->stop = 1;
3026 				break;
3027 			}
3028 			arg->count++;
3029 		}
3030 	}
3031 }
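
/* Editor's illustration (hypothetical helper): cake_walk() and
 * cake_dump_class_stats() agree on a 1-based class id encoding over
 * (tin-order index, flow index); the round trip looks like:
 */
static inline unsigned long cake_classid_sketch(u32 tin_order_idx, u32 flow)
{
	return tin_order_idx * CAKE_QUEUES + flow + 1;	/* 0 means "none" */
}

/* cake_dump_class_stats() inverts this with idx = cl - 1, giving
 * tin_order_idx = idx / CAKE_QUEUES and flow = idx % CAKE_QUEUES.
 */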
3032 
3033 static const struct Qdisc_class_ops cake_class_ops = {
3034 	.leaf		=	cake_leaf,
3035 	.find		=	cake_find,
3036 	.tcf_block	=	cake_tcf_block,
3037 	.bind_tcf	=	cake_bind,
3038 	.unbind_tcf	=	cake_unbind,
3039 	.dump		=	cake_dump_class,
3040 	.dump_stats	=	cake_dump_class_stats,
3041 	.walk		=	cake_walk,
3042 };
3043 
3044 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3045 	.cl_ops		=	&cake_class_ops,
3046 	.id		=	"cake",
3047 	.priv_size	=	sizeof(struct cake_sched_data),
3048 	.enqueue	=	cake_enqueue,
3049 	.dequeue	=	cake_dequeue,
3050 	.peek		=	qdisc_peek_dequeued,
3051 	.init		=	cake_init,
3052 	.reset		=	cake_reset,
3053 	.destroy	=	cake_destroy,
3054 	.change		=	cake_change,
3055 	.dump		=	cake_dump,
3056 	.dump_stats	=	cake_dump_stats,
3057 	.owner		=	THIS_MODULE,
3058 };
3059 
3060 static int __init cake_module_init(void)
3061 {
3062 	return register_qdisc(&cake_qdisc_ops);
3063 }
3064 
3065 static void __exit cake_module_exit(void)
3066 {
3067 	unregister_qdisc(&cake_qdisc_ops);
3068 }
3069 
3070 module_init(cake_module_init);
3071 module_exit(cake_module_exit);
3072 MODULE_AUTHOR("Jonathan Morton");
3073 MODULE_LICENSE("Dual BSD/GPL");
3074 MODULE_DESCRIPTION("The CAKE shaper.");
3075