xref: /openbmc/linux/net/sched/sch_cake.c (revision e65e175b07bef5974045cc42238de99057669ca7)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop rate
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_bulk_flow_count;
142 	u16 dsthost_bulk_flow_count;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum;
177 	s32	tin_deficit;
178 	u32	tin_backlog;
179 	u32	tin_dropped;
180 	u32	tin_ecn_mark;
181 
182 	u32	packets;
183 	u64	bytes;
184 
185 	u32	ack_drops;
186 
187 	/* moving averages */
188 	u64 avge_delay;
189 	u64 peak_delay;
190 	u64 base_delay;
191 
192 	/* hash function stats */
193 	u32	way_directs;
194 	u32	way_hits;
195 	u32	way_misses;
196 	u32	way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198 
199 struct cake_sched_data {
200 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
201 	struct tcf_block *block;
202 	struct cake_tin_data *tins;
203 
204 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205 	u16		overflow_timeout;
206 
207 	u16		tin_cnt;
208 	u8		tin_mode;
209 	u8		flow_mode;
210 	u8		ack_filter;
211 	u8		atm_mode;
212 
213 	u32		fwmark_mask;
214 	u16		fwmark_shft;
215 
216 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217 	u16		rate_shft;
218 	ktime_t		time_next_packet;
219 	ktime_t		failsafe_next_packet;
220 	u64		rate_ns;
221 	u64		rate_bps;
222 	u16		rate_flags;
223 	s16		rate_overhead;
224 	u16		rate_mpu;
225 	u64		interval;
226 	u64		target;
227 
228 	/* resource tracking */
229 	u32		buffer_used;
230 	u32		buffer_max_used;
231 	u32		buffer_limit;
232 	u32		buffer_config_limit;
233 
234 	/* indices for dequeue */
235 	u16		cur_tin;
236 	u16		cur_flow;
237 
238 	struct qdisc_watchdog watchdog;
239 	const u8	*tin_index;
240 	const u8	*tin_order;
241 
242 	/* bandwidth capacity estimate */
243 	ktime_t		last_packet_time;
244 	ktime_t		avg_window_begin;
245 	u64		avg_packet_interval;
246 	u64		avg_window_bytes;
247 	u64		avg_peak_bandwidth;
248 	ktime_t		last_reconfig_time;
249 
250 	/* packet length stats */
251 	u32		avg_netoff;
252 	u16		max_netlen;
253 	u16		max_adjlen;
254 	u16		min_netlen;
255 	u16		min_adjlen;
256 };
257 
258 enum {
259 	CAKE_FLAG_OVERHEAD	   = BIT(0),
260 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261 	CAKE_FLAG_INGRESS	   = BIT(2),
262 	CAKE_FLAG_WASH		   = BIT(3),
263 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
264 };
265 
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271 
272 struct cobalt_skb_cb {
273 	ktime_t enqueue_time;
274 	u32     adjusted_len;
275 };
276 
277 static u64 us_to_ns(u64 us)
278 {
279 	return us * NSEC_PER_USEC;
280 }
281 
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287 
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290 	return get_cobalt_cb(skb)->enqueue_time;
291 }
292 
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294 				    ktime_t now)
295 {
296 	get_cobalt_cb(skb)->enqueue_time = now;
297 }
298 
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300 
301 /* Diffserv lookup tables */
302 
303 static const u8 precedence[] = {
304 	0, 0, 0, 0, 0, 0, 0, 0,
305 	1, 1, 1, 1, 1, 1, 1, 1,
306 	2, 2, 2, 2, 2, 2, 2, 2,
307 	3, 3, 3, 3, 3, 3, 3, 3,
308 	4, 4, 4, 4, 4, 4, 4, 4,
309 	5, 5, 5, 5, 5, 5, 5, 5,
310 	6, 6, 6, 6, 6, 6, 6, 6,
311 	7, 7, 7, 7, 7, 7, 7, 7,
312 };
313 
314 static const u8 diffserv8[] = {
315 	2, 0, 1, 2, 4, 2, 2, 2,
316 	1, 2, 1, 2, 1, 2, 1, 2,
317 	5, 2, 4, 2, 4, 2, 4, 2,
318 	3, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 2, 2, 6, 2, 6, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 };
324 
325 static const u8 diffserv4[] = {
326 	0, 1, 0, 0, 2, 0, 0, 0,
327 	1, 0, 0, 0, 0, 0, 0, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 0, 0, 3, 0, 3, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 };
335 
336 static const u8 diffserv3[] = {
337 	0, 1, 0, 0, 2, 0, 0, 0,
338 	1, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 2, 0, 2, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 };
346 
347 static const u8 besteffort[] = {
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 };
357 
358 /* tin priority order for stats dumping */
359 
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362 
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365 
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
371 
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374 	u32 invsqrt, invsqrt2;
375 	u64 val;
376 
377 	invsqrt = vars->rec_inv_sqrt;
378 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380 
381 	val >>= 2; /* avoid overflow in following multiply */
382 	val = (val * invsqrt) >> (32 - 2 + 1);
383 
384 	vars->rec_inv_sqrt = val;
385 }
386 
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389 	if (vars->count < REC_INV_SQRT_CACHE)
390 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391 	else
392 		cobalt_newton_step(vars);
393 }
394 
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is such as to give
402  * the value that *should* have been produced at count 4.
403  */
404 
405 static void cobalt_cache_init(void)
406 {
407 	struct cobalt_vars v;
408 
409 	memset(&v, 0, sizeof(v));
410 	v.rec_inv_sqrt = ~0U;
411 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412 
413 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 
419 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420 	}
421 }
422 
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425 	memset(vars, 0, sizeof(*vars));
426 
427 	if (!cobalt_rec_inv_sqrt_cache[0]) {
428 		cobalt_cache_init();
429 		cobalt_rec_inv_sqrt_cache[0] = ~0;
430 	}
431 }
432 
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
437 static ktime_t cobalt_control(ktime_t t,
438 			      u64 interval,
439 			      u32 rec_inv_sqrt)
440 {
441 	return ktime_add_ns(t, reciprocal_scale(interval,
442 						rec_inv_sqrt));
443 }
444 
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449 			      struct cobalt_params *p,
450 			      ktime_t now)
451 {
452 	bool up = false;
453 
454 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455 		up = !vars->p_drop;
456 		vars->p_drop += p->p_inc;
457 		if (vars->p_drop < p->p_inc)
458 			vars->p_drop = ~0;
459 		vars->blue_timer = now;
460 	}
461 	vars->dropping = true;
462 	vars->drop_next = now;
463 	if (!vars->count)
464 		vars->count = 1;
465 
466 	return up;
467 }
468 
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473 			       struct cobalt_params *p,
474 			       ktime_t now)
475 {
476 	bool down = false;
477 
478 	if (vars->p_drop &&
479 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480 		if (vars->p_drop < p->p_dec)
481 			vars->p_drop = 0;
482 		else
483 			vars->p_drop -= p->p_dec;
484 		vars->blue_timer = now;
485 		down = !vars->p_drop;
486 	}
487 	vars->dropping = false;
488 
489 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490 		vars->count--;
491 		cobalt_invsqrt(vars);
492 		vars->drop_next = cobalt_control(vars->drop_next,
493 						 p->interval,
494 						 vars->rec_inv_sqrt);
495 	}
496 
497 	return down;
498 }
499 
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504 			       struct cobalt_params *p,
505 			       ktime_t now,
506 			       struct sk_buff *skb,
507 			       u32 bulk_flows)
508 {
509 	bool next_due, over_target, drop = false;
510 	ktime_t schedule;
511 	u64 sojourn;
512 
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527 
528 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529 	schedule = ktime_sub(now, vars->drop_next);
530 	over_target = sojourn > p->target &&
531 		      sojourn > p->mtu_time * bulk_flows * 2 &&
532 		      sojourn > p->mtu_time * 4;
533 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
534 
535 	vars->ecn_marked = false;
536 
537 	if (over_target) {
538 		if (!vars->dropping) {
539 			vars->dropping = true;
540 			vars->drop_next = cobalt_control(now,
541 							 p->interval,
542 							 vars->rec_inv_sqrt);
543 		}
544 		if (!vars->count)
545 			vars->count = 1;
546 	} else if (vars->dropping) {
547 		vars->dropping = false;
548 	}
549 
550 	if (next_due && vars->dropping) {
551 		/* Use ECN mark if possible, otherwise drop */
552 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553 
554 		vars->count++;
555 		if (!vars->count)
556 			vars->count--;
557 		cobalt_invsqrt(vars);
558 		vars->drop_next = cobalt_control(vars->drop_next,
559 						 p->interval,
560 						 vars->rec_inv_sqrt);
561 		schedule = ktime_sub(now, vars->drop_next);
562 	} else {
563 		while (next_due) {
564 			vars->count--;
565 			cobalt_invsqrt(vars);
566 			vars->drop_next = cobalt_control(vars->drop_next,
567 							 p->interval,
568 							 vars->rec_inv_sqrt);
569 			schedule = ktime_sub(now, vars->drop_next);
570 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
571 		}
572 	}
573 
574 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
575 	if (vars->p_drop)
576 		drop |= (get_random_u32() < vars->p_drop);
577 
578 	/* Overload the drop_next field as an activity timeout */
579 	if (!vars->count)
580 		vars->drop_next = ktime_add_ns(now, p->interval);
581 	else if (ktime_to_ns(schedule) > 0 && !drop)
582 		vars->drop_next = now;
583 
584 	return drop;
585 }
586 
587 static bool cake_update_flowkeys(struct flow_keys *keys,
588 				 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 	struct nf_conntrack_tuple tuple = {};
592 	bool rev = !skb->_nfct, upd = false;
593 	__be32 ip;
594 
595 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
596 		return false;
597 
598 	if (!nf_ct_get_tuple_skb(&tuple, skb))
599 		return false;
600 
601 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602 	if (ip != keys->addrs.v4addrs.src) {
603 		keys->addrs.v4addrs.src = ip;
604 		upd = true;
605 	}
606 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
607 	if (ip != keys->addrs.v4addrs.dst) {
608 		keys->addrs.v4addrs.dst = ip;
609 		upd = true;
610 	}
611 
612 	if (keys->ports.ports) {
613 		__be16 port;
614 
615 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
616 		if (port != keys->ports.src) {
617 			keys->ports.src = port;
618 			upd = true;
619 		}
620 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
621 		if (port != keys->ports.dst) {
622 			port = keys->ports.dst;
623 			upd = true;
624 		}
625 	}
626 	return upd;
627 #else
628 	return false;
629 #endif
630 }
631 
632 /* Cake has several subtle multiple bit settings. In these cases you
633  *  would be matching triple isolate mode as well.
634  */
635 
636 static bool cake_dsrc(int flow_mode)
637 {
638 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
639 }
640 
641 static bool cake_ddst(int flow_mode)
642 {
643 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
644 }
645 
646 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
647 		     int flow_mode, u16 flow_override, u16 host_override)
648 {
649 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
650 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
651 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
652 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
653 	u16 reduced_hash, srchost_idx, dsthost_idx;
654 	struct flow_keys keys, host_keys;
655 	bool use_skbhash = skb->l4_hash;
656 
657 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
658 		return 0;
659 
660 	/* If both overrides are set, or we can use the SKB hash and nat mode is
661 	 * disabled, we can skip packet dissection entirely. If nat mode is
662 	 * enabled there's another check below after doing the conntrack lookup.
663 	 */
664 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
665 		goto skip_hash;
666 
667 	skb_flow_dissect_flow_keys(skb, &keys,
668 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
669 
670 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
671 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
672 		use_skbhash = false;
673 
674 	/* If we can still use the SKB hash and don't need the host hash, we can
675 	 * skip the rest of the hashing procedure
676 	 */
677 	if (use_skbhash && !hash_hosts)
678 		goto skip_hash;
679 
680 	/* flow_hash_from_keys() sorts the addresses by value, so we have
681 	 * to preserve their order in a separate data structure to treat
682 	 * src and dst host addresses as independently selectable.
683 	 */
684 	host_keys = keys;
685 	host_keys.ports.ports     = 0;
686 	host_keys.basic.ip_proto  = 0;
687 	host_keys.keyid.keyid     = 0;
688 	host_keys.tags.flow_label = 0;
689 
690 	switch (host_keys.control.addr_type) {
691 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
692 		host_keys.addrs.v4addrs.src = 0;
693 		dsthost_hash = flow_hash_from_keys(&host_keys);
694 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
695 		host_keys.addrs.v4addrs.dst = 0;
696 		srchost_hash = flow_hash_from_keys(&host_keys);
697 		break;
698 
699 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
700 		memset(&host_keys.addrs.v6addrs.src, 0,
701 		       sizeof(host_keys.addrs.v6addrs.src));
702 		dsthost_hash = flow_hash_from_keys(&host_keys);
703 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
704 		memset(&host_keys.addrs.v6addrs.dst, 0,
705 		       sizeof(host_keys.addrs.v6addrs.dst));
706 		srchost_hash = flow_hash_from_keys(&host_keys);
707 		break;
708 
709 	default:
710 		dsthost_hash = 0;
711 		srchost_hash = 0;
712 	}
713 
714 	/* This *must* be after the above switch, since as a
715 	 * side-effect it sorts the src and dst addresses.
716 	 */
717 	if (hash_flows && !use_skbhash)
718 		flow_hash = flow_hash_from_keys(&keys);
719 
720 skip_hash:
721 	if (flow_override)
722 		flow_hash = flow_override - 1;
723 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
724 		flow_hash = skb->hash;
725 	if (host_override) {
726 		dsthost_hash = host_override - 1;
727 		srchost_hash = host_override - 1;
728 	}
729 
730 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
731 		if (flow_mode & CAKE_FLOW_SRC_IP)
732 			flow_hash ^= srchost_hash;
733 
734 		if (flow_mode & CAKE_FLOW_DST_IP)
735 			flow_hash ^= dsthost_hash;
736 	}
737 
738 	reduced_hash = flow_hash % CAKE_QUEUES;
739 
740 	/* set-associative hashing */
741 	/* fast path if no hash collision (direct lookup succeeds) */
742 	if (likely(q->tags[reduced_hash] == flow_hash &&
743 		   q->flows[reduced_hash].set)) {
744 		q->way_directs++;
745 	} else {
746 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
747 		u32 outer_hash = reduced_hash - inner_hash;
748 		bool allocate_src = false;
749 		bool allocate_dst = false;
750 		u32 i, k;
751 
752 		/* check if any active queue in the set is reserved for
753 		 * this flow.
754 		 */
755 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
756 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
757 			if (q->tags[outer_hash + k] == flow_hash) {
758 				if (i)
759 					q->way_hits++;
760 
761 				if (!q->flows[outer_hash + k].set) {
762 					/* need to increment host refcnts */
763 					allocate_src = cake_dsrc(flow_mode);
764 					allocate_dst = cake_ddst(flow_mode);
765 				}
766 
767 				goto found;
768 			}
769 		}
770 
771 		/* no queue is reserved for this flow, look for an
772 		 * empty one.
773 		 */
774 		for (i = 0; i < CAKE_SET_WAYS;
775 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
776 			if (!q->flows[outer_hash + k].set) {
777 				q->way_misses++;
778 				allocate_src = cake_dsrc(flow_mode);
779 				allocate_dst = cake_ddst(flow_mode);
780 				goto found;
781 			}
782 		}
783 
784 		/* With no empty queues, default to the original
785 		 * queue, accept the collision, update the host tags.
786 		 */
787 		q->way_collisions++;
788 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
789 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
790 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
791 		}
792 		allocate_src = cake_dsrc(flow_mode);
793 		allocate_dst = cake_ddst(flow_mode);
794 found:
795 		/* reserve queue for future packets in same flow */
796 		reduced_hash = outer_hash + k;
797 		q->tags[reduced_hash] = flow_hash;
798 
799 		if (allocate_src) {
800 			srchost_idx = srchost_hash % CAKE_QUEUES;
801 			inner_hash = srchost_idx % CAKE_SET_WAYS;
802 			outer_hash = srchost_idx - inner_hash;
803 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
804 				i++, k = (k + 1) % CAKE_SET_WAYS) {
805 				if (q->hosts[outer_hash + k].srchost_tag ==
806 				    srchost_hash)
807 					goto found_src;
808 			}
809 			for (i = 0; i < CAKE_SET_WAYS;
810 				i++, k = (k + 1) % CAKE_SET_WAYS) {
811 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
812 					break;
813 			}
814 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
815 found_src:
816 			srchost_idx = outer_hash + k;
817 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
818 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
819 			q->flows[reduced_hash].srchost = srchost_idx;
820 		}
821 
822 		if (allocate_dst) {
823 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
824 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
825 			outer_hash = dsthost_idx - inner_hash;
826 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
827 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
828 				if (q->hosts[outer_hash + k].dsthost_tag ==
829 				    dsthost_hash)
830 					goto found_dst;
831 			}
832 			for (i = 0; i < CAKE_SET_WAYS;
833 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
834 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
835 					break;
836 			}
837 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
838 found_dst:
839 			dsthost_idx = outer_hash + k;
840 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
841 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
842 			q->flows[reduced_hash].dsthost = dsthost_idx;
843 		}
844 	}
845 
846 	return reduced_hash;
847 }
848 
849 /* helper functions : might be changed when/if skb use a standard list_head */
850 /* remove one skb from head of slot queue */
851 
852 static struct sk_buff *dequeue_head(struct cake_flow *flow)
853 {
854 	struct sk_buff *skb = flow->head;
855 
856 	if (skb) {
857 		flow->head = skb->next;
858 		skb_mark_not_on_list(skb);
859 	}
860 
861 	return skb;
862 }
863 
864 /* add skb to flow queue (tail add) */
865 
866 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
867 {
868 	if (!flow->head)
869 		flow->head = skb;
870 	else
871 		flow->tail->next = skb;
872 	flow->tail = skb;
873 	skb->next = NULL;
874 }
875 
876 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
877 				    struct ipv6hdr *buf)
878 {
879 	unsigned int offset = skb_network_offset(skb);
880 	struct iphdr *iph;
881 
882 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
883 
884 	if (!iph)
885 		return NULL;
886 
887 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
888 		return skb_header_pointer(skb, offset + iph->ihl * 4,
889 					  sizeof(struct ipv6hdr), buf);
890 
891 	else if (iph->version == 4)
892 		return iph;
893 
894 	else if (iph->version == 6)
895 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
896 					  buf);
897 
898 	return NULL;
899 }
900 
901 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
902 				      void *buf, unsigned int bufsize)
903 {
904 	unsigned int offset = skb_network_offset(skb);
905 	const struct ipv6hdr *ipv6h;
906 	const struct tcphdr *tcph;
907 	const struct iphdr *iph;
908 	struct ipv6hdr _ipv6h;
909 	struct tcphdr _tcph;
910 
911 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
912 
913 	if (!ipv6h)
914 		return NULL;
915 
916 	if (ipv6h->version == 4) {
917 		iph = (struct iphdr *)ipv6h;
918 		offset += iph->ihl * 4;
919 
920 		/* special-case 6in4 tunnelling, as that is a common way to get
921 		 * v6 connectivity in the home
922 		 */
923 		if (iph->protocol == IPPROTO_IPV6) {
924 			ipv6h = skb_header_pointer(skb, offset,
925 						   sizeof(_ipv6h), &_ipv6h);
926 
927 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
928 				return NULL;
929 
930 			offset += sizeof(struct ipv6hdr);
931 
932 		} else if (iph->protocol != IPPROTO_TCP) {
933 			return NULL;
934 		}
935 
936 	} else if (ipv6h->version == 6) {
937 		if (ipv6h->nexthdr != IPPROTO_TCP)
938 			return NULL;
939 
940 		offset += sizeof(struct ipv6hdr);
941 	} else {
942 		return NULL;
943 	}
944 
945 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
946 	if (!tcph || tcph->doff < 5)
947 		return NULL;
948 
949 	return skb_header_pointer(skb, offset,
950 				  min(__tcp_hdrlen(tcph), bufsize), buf);
951 }
952 
953 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
954 				   int code, int *oplen)
955 {
956 	/* inspired by tcp_parse_options in tcp_input.c */
957 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
958 	const u8 *ptr = (const u8 *)(tcph + 1);
959 
960 	while (length > 0) {
961 		int opcode = *ptr++;
962 		int opsize;
963 
964 		if (opcode == TCPOPT_EOL)
965 			break;
966 		if (opcode == TCPOPT_NOP) {
967 			length--;
968 			continue;
969 		}
970 		if (length < 2)
971 			break;
972 		opsize = *ptr++;
973 		if (opsize < 2 || opsize > length)
974 			break;
975 
976 		if (opcode == code) {
977 			*oplen = opsize;
978 			return ptr;
979 		}
980 
981 		ptr += opsize - 2;
982 		length -= opsize;
983 	}
984 
985 	return NULL;
986 }
987 
988 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
989  * bytes than the other. In the case where both sequences ACKs bytes that the
990  * other doesn't, A is considered greater. DSACKs in A also makes A be
991  * considered greater.
992  *
993  * @return -1, 0 or 1 as normal compare functions
994  */
995 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
996 				  const struct tcphdr *tcph_b)
997 {
998 	const struct tcp_sack_block_wire *sack_a, *sack_b;
999 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1000 	u32 bytes_a = 0, bytes_b = 0;
1001 	int oplen_a, oplen_b;
1002 	bool first = true;
1003 
1004 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1005 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1006 
1007 	/* pointers point to option contents */
1008 	oplen_a -= TCPOLEN_SACK_BASE;
1009 	oplen_b -= TCPOLEN_SACK_BASE;
1010 
1011 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1012 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1013 		return -1;
1014 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1015 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1016 		return 1;
1017 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1018 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1019 		return 0;
1020 
1021 	while (oplen_a >= sizeof(*sack_a)) {
1022 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1023 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1024 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1025 		int oplen_tmp = oplen_b;
1026 		bool found = false;
1027 
1028 		/* DSACK; always considered greater to prevent dropping */
1029 		if (before(start_a, ack_seq_a))
1030 			return -1;
1031 
1032 		bytes_a += end_a - start_a;
1033 
1034 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1035 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1036 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1037 
1038 			/* first time through we count the total size */
1039 			if (first)
1040 				bytes_b += end_b - start_b;
1041 
1042 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1043 				found = true;
1044 				if (!first)
1045 					break;
1046 			}
1047 			oplen_tmp -= sizeof(*sack_tmp);
1048 			sack_tmp++;
1049 		}
1050 
1051 		if (!found)
1052 			return -1;
1053 
1054 		oplen_a -= sizeof(*sack_a);
1055 		sack_a++;
1056 		first = false;
1057 	}
1058 
1059 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1060 	 * either the SACKs are equal, or B SACKs more bytes.
1061 	 */
1062 	return bytes_b > bytes_a ? 1 : 0;
1063 }
1064 
1065 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1066 				 u32 *tsval, u32 *tsecr)
1067 {
1068 	const u8 *ptr;
1069 	int opsize;
1070 
1071 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1072 
1073 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1074 		*tsval = get_unaligned_be32(ptr);
1075 		*tsecr = get_unaligned_be32(ptr + 4);
1076 	}
1077 }
1078 
1079 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1080 			       u32 tstamp_new, u32 tsecr_new)
1081 {
1082 	/* inspired by tcp_parse_options in tcp_input.c */
1083 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1084 	const u8 *ptr = (const u8 *)(tcph + 1);
1085 	u32 tstamp, tsecr;
1086 
1087 	/* 3 reserved flags must be unset to avoid future breakage
1088 	 * ACK must be set
1089 	 * ECE/CWR are handled separately
1090 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1091 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1092 	 * 0x00C00000 = CWR/ECE (handled separately)
1093 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1094 	 */
1095 	if (((tcp_flag_word(tcph) &
1096 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1097 		return false;
1098 
1099 	while (length > 0) {
1100 		int opcode = *ptr++;
1101 		int opsize;
1102 
1103 		if (opcode == TCPOPT_EOL)
1104 			break;
1105 		if (opcode == TCPOPT_NOP) {
1106 			length--;
1107 			continue;
1108 		}
1109 		if (length < 2)
1110 			break;
1111 		opsize = *ptr++;
1112 		if (opsize < 2 || opsize > length)
1113 			break;
1114 
1115 		switch (opcode) {
1116 		case TCPOPT_MD5SIG: /* doesn't influence state */
1117 			break;
1118 
1119 		case TCPOPT_SACK: /* stricter checking performed later */
1120 			if (opsize % 8 != 2)
1121 				return false;
1122 			break;
1123 
1124 		case TCPOPT_TIMESTAMP:
1125 			/* only drop timestamps lower than new */
1126 			if (opsize != TCPOLEN_TIMESTAMP)
1127 				return false;
1128 			tstamp = get_unaligned_be32(ptr);
1129 			tsecr = get_unaligned_be32(ptr + 4);
1130 			if (after(tstamp, tstamp_new) ||
1131 			    after(tsecr, tsecr_new))
1132 				return false;
1133 			break;
1134 
1135 		case TCPOPT_MSS:  /* these should only be set on SYN */
1136 		case TCPOPT_WINDOW:
1137 		case TCPOPT_SACK_PERM:
1138 		case TCPOPT_FASTOPEN:
1139 		case TCPOPT_EXP:
1140 		default: /* don't drop if any unknown options are present */
1141 			return false;
1142 		}
1143 
1144 		ptr += opsize - 2;
1145 		length -= opsize;
1146 	}
1147 
1148 	return true;
1149 }
1150 
1151 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1152 				       struct cake_flow *flow)
1153 {
1154 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1155 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1156 	struct sk_buff *skb_check, *skb_prev = NULL;
1157 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1158 	unsigned char _tcph[64], _tcph_check[64];
1159 	const struct tcphdr *tcph, *tcph_check;
1160 	const struct iphdr *iph, *iph_check;
1161 	struct ipv6hdr _iph, _iph_check;
1162 	const struct sk_buff *skb;
1163 	int seglen, num_found = 0;
1164 	u32 tstamp = 0, tsecr = 0;
1165 	__be32 elig_flags = 0;
1166 	int sack_comp;
1167 
1168 	/* no other possible ACKs to filter */
1169 	if (flow->head == flow->tail)
1170 		return NULL;
1171 
1172 	skb = flow->tail;
1173 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1174 	iph = cake_get_iphdr(skb, &_iph);
1175 	if (!tcph)
1176 		return NULL;
1177 
1178 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1179 
1180 	/* the 'triggering' packet need only have the ACK flag set.
1181 	 * also check that SYN is not set, as there won't be any previous ACKs.
1182 	 */
1183 	if ((tcp_flag_word(tcph) &
1184 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1185 		return NULL;
1186 
1187 	/* the 'triggering' ACK is at the tail of the queue, we have already
1188 	 * returned if it is the only packet in the flow. loop through the rest
1189 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1190 	 * triggering one.
1191 	 */
1192 	for (skb_check = flow->head;
1193 	     skb_check && skb_check != skb;
1194 	     skb_prev = skb_check, skb_check = skb_check->next) {
1195 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1196 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1197 					     sizeof(_tcph_check));
1198 
1199 		/* only TCP packets with matching 5-tuple are eligible, and only
1200 		 * drop safe headers
1201 		 */
1202 		if (!tcph_check || iph->version != iph_check->version ||
1203 		    tcph_check->source != tcph->source ||
1204 		    tcph_check->dest != tcph->dest)
1205 			continue;
1206 
1207 		if (iph_check->version == 4) {
1208 			if (iph_check->saddr != iph->saddr ||
1209 			    iph_check->daddr != iph->daddr)
1210 				continue;
1211 
1212 			seglen = ntohs(iph_check->tot_len) -
1213 				       (4 * iph_check->ihl);
1214 		} else if (iph_check->version == 6) {
1215 			ipv6h = (struct ipv6hdr *)iph;
1216 			ipv6h_check = (struct ipv6hdr *)iph_check;
1217 
1218 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1219 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1220 				continue;
1221 
1222 			seglen = ntohs(ipv6h_check->payload_len);
1223 		} else {
1224 			WARN_ON(1);  /* shouldn't happen */
1225 			continue;
1226 		}
1227 
1228 		/* If the ECE/CWR flags changed from the previous eligible
1229 		 * packet in the same flow, we should no longer be dropping that
1230 		 * previous packet as this would lose information.
1231 		 */
1232 		if (elig_ack && (tcp_flag_word(tcph_check) &
1233 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1234 			elig_ack = NULL;
1235 			elig_ack_prev = NULL;
1236 			num_found--;
1237 		}
1238 
1239 		/* Check TCP options and flags, don't drop ACKs with segment
1240 		 * data, and don't drop ACKs with a higher cumulative ACK
1241 		 * counter than the triggering packet. Check ACK seqno here to
1242 		 * avoid parsing SACK options of packets we are going to exclude
1243 		 * anyway.
1244 		 */
1245 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1246 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1247 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1248 			continue;
1249 
1250 		/* Check SACK options. The triggering packet must SACK more data
1251 		 * than the ACK under consideration, or SACK the same range but
1252 		 * have a larger cumulative ACK counter. The latter is a
1253 		 * pathological case, but is contained in the following check
1254 		 * anyway, just to be safe.
1255 		 */
1256 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1257 
1258 		if (sack_comp < 0 ||
1259 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1260 		     sack_comp == 0))
1261 			continue;
1262 
1263 		/* At this point we have found an eligible pure ACK to drop; if
1264 		 * we are in aggressive mode, we are done. Otherwise, keep
1265 		 * searching unless this is the second eligible ACK we
1266 		 * found.
1267 		 *
1268 		 * Since we want to drop ACK closest to the head of the queue,
1269 		 * save the first eligible ACK we find, even if we need to loop
1270 		 * again.
1271 		 */
1272 		if (!elig_ack) {
1273 			elig_ack = skb_check;
1274 			elig_ack_prev = skb_prev;
1275 			elig_flags = (tcp_flag_word(tcph_check)
1276 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1277 		}
1278 
1279 		if (num_found++ > 0)
1280 			goto found;
1281 	}
1282 
1283 	/* We made it through the queue without finding two eligible ACKs . If
1284 	 * we found a single eligible ACK we can drop it in aggressive mode if
1285 	 * we can guarantee that this does not interfere with ECN flag
1286 	 * information. We ensure this by dropping it only if the enqueued
1287 	 * packet is consecutive with the eligible ACK, and their flags match.
1288 	 */
1289 	if (elig_ack && aggressive && elig_ack->next == skb &&
1290 	    (elig_flags == (tcp_flag_word(tcph) &
1291 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1292 		goto found;
1293 
1294 	return NULL;
1295 
1296 found:
1297 	if (elig_ack_prev)
1298 		elig_ack_prev->next = elig_ack->next;
1299 	else
1300 		flow->head = elig_ack->next;
1301 
1302 	skb_mark_not_on_list(elig_ack);
1303 
1304 	return elig_ack;
1305 }
1306 
1307 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1308 {
1309 	avg -= avg >> shift;
1310 	avg += sample >> shift;
1311 	return avg;
1312 }
1313 
1314 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1315 {
1316 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1317 		len -= off;
1318 
1319 	if (q->max_netlen < len)
1320 		q->max_netlen = len;
1321 	if (q->min_netlen > len)
1322 		q->min_netlen = len;
1323 
1324 	len += q->rate_overhead;
1325 
1326 	if (len < q->rate_mpu)
1327 		len = q->rate_mpu;
1328 
1329 	if (q->atm_mode == CAKE_ATM_ATM) {
1330 		len += 47;
1331 		len /= 48;
1332 		len *= 53;
1333 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1334 		/* Add one byte per 64 bytes or part thereof.
1335 		 * This is conservative and easier to calculate than the
1336 		 * precise value.
1337 		 */
1338 		len += (len + 63) / 64;
1339 	}
1340 
1341 	if (q->max_adjlen < len)
1342 		q->max_adjlen = len;
1343 	if (q->min_adjlen > len)
1344 		q->min_adjlen = len;
1345 
1346 	return len;
1347 }
1348 
1349 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1350 {
1351 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1352 	unsigned int hdr_len, last_len = 0;
1353 	u32 off = skb_network_offset(skb);
1354 	u32 len = qdisc_pkt_len(skb);
1355 	u16 segs = 1;
1356 
1357 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1358 
1359 	if (!shinfo->gso_size)
1360 		return cake_calc_overhead(q, len, off);
1361 
1362 	/* borrowed from qdisc_pkt_len_init() */
1363 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1364 
1365 	/* + transport layer */
1366 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1367 						SKB_GSO_TCPV6))) {
1368 		const struct tcphdr *th;
1369 		struct tcphdr _tcphdr;
1370 
1371 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1372 					sizeof(_tcphdr), &_tcphdr);
1373 		if (likely(th))
1374 			hdr_len += __tcp_hdrlen(th);
1375 	} else {
1376 		struct udphdr _udphdr;
1377 
1378 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1379 				       sizeof(_udphdr), &_udphdr))
1380 			hdr_len += sizeof(struct udphdr);
1381 	}
1382 
1383 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1384 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1385 				    shinfo->gso_size);
1386 	else
1387 		segs = shinfo->gso_segs;
1388 
1389 	len = shinfo->gso_size + hdr_len;
1390 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1391 
1392 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1393 		cake_calc_overhead(q, last_len, off));
1394 }
1395 
1396 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1397 {
1398 	struct cake_heap_entry ii = q->overflow_heap[i];
1399 	struct cake_heap_entry jj = q->overflow_heap[j];
1400 
1401 	q->overflow_heap[i] = jj;
1402 	q->overflow_heap[j] = ii;
1403 
1404 	q->tins[ii.t].overflow_idx[ii.b] = j;
1405 	q->tins[jj.t].overflow_idx[jj.b] = i;
1406 }
1407 
1408 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1409 {
1410 	struct cake_heap_entry ii = q->overflow_heap[i];
1411 
1412 	return q->tins[ii.t].backlogs[ii.b];
1413 }
1414 
1415 static void cake_heapify(struct cake_sched_data *q, u16 i)
1416 {
1417 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1418 	u32 mb = cake_heap_get_backlog(q, i);
1419 	u32 m = i;
1420 
1421 	while (m < a) {
1422 		u32 l = m + m + 1;
1423 		u32 r = l + 1;
1424 
1425 		if (l < a) {
1426 			u32 lb = cake_heap_get_backlog(q, l);
1427 
1428 			if (lb > mb) {
1429 				m  = l;
1430 				mb = lb;
1431 			}
1432 		}
1433 
1434 		if (r < a) {
1435 			u32 rb = cake_heap_get_backlog(q, r);
1436 
1437 			if (rb > mb) {
1438 				m  = r;
1439 				mb = rb;
1440 			}
1441 		}
1442 
1443 		if (m != i) {
1444 			cake_heap_swap(q, i, m);
1445 			i = m;
1446 		} else {
1447 			break;
1448 		}
1449 	}
1450 }
1451 
1452 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1453 {
1454 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1455 		u16 p = (i - 1) >> 1;
1456 		u32 ib = cake_heap_get_backlog(q, i);
1457 		u32 pb = cake_heap_get_backlog(q, p);
1458 
1459 		if (ib > pb) {
1460 			cake_heap_swap(q, i, p);
1461 			i = p;
1462 		} else {
1463 			break;
1464 		}
1465 	}
1466 }
1467 
1468 static int cake_advance_shaper(struct cake_sched_data *q,
1469 			       struct cake_tin_data *b,
1470 			       struct sk_buff *skb,
1471 			       ktime_t now, bool drop)
1472 {
1473 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1474 
1475 	/* charge packet bandwidth to this tin
1476 	 * and to the global shaper.
1477 	 */
1478 	if (q->rate_ns) {
1479 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1480 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1481 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1482 
1483 		if (ktime_before(b->time_next_packet, now))
1484 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1485 							   tin_dur);
1486 
1487 		else if (ktime_before(b->time_next_packet,
1488 				      ktime_add_ns(now, tin_dur)))
1489 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1490 
1491 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1492 						   global_dur);
1493 		if (!drop)
1494 			q->failsafe_next_packet = \
1495 				ktime_add_ns(q->failsafe_next_packet,
1496 					     failsafe_dur);
1497 	}
1498 	return len;
1499 }
1500 
1501 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1502 {
1503 	struct cake_sched_data *q = qdisc_priv(sch);
1504 	ktime_t now = ktime_get();
1505 	u32 idx = 0, tin = 0, len;
1506 	struct cake_heap_entry qq;
1507 	struct cake_tin_data *b;
1508 	struct cake_flow *flow;
1509 	struct sk_buff *skb;
1510 
1511 	if (!q->overflow_timeout) {
1512 		int i;
1513 		/* Build fresh max-heap */
1514 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1515 			cake_heapify(q, i);
1516 	}
1517 	q->overflow_timeout = 65535;
1518 
1519 	/* select longest queue for pruning */
1520 	qq  = q->overflow_heap[0];
1521 	tin = qq.t;
1522 	idx = qq.b;
1523 
1524 	b = &q->tins[tin];
1525 	flow = &b->flows[idx];
1526 	skb = dequeue_head(flow);
1527 	if (unlikely(!skb)) {
1528 		/* heap has gone wrong, rebuild it next time */
1529 		q->overflow_timeout = 0;
1530 		return idx + (tin << 16);
1531 	}
1532 
1533 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1534 		b->unresponsive_flow_count++;
1535 
1536 	len = qdisc_pkt_len(skb);
1537 	q->buffer_used      -= skb->truesize;
1538 	b->backlogs[idx]    -= len;
1539 	b->tin_backlog      -= len;
1540 	sch->qstats.backlog -= len;
1541 	qdisc_tree_reduce_backlog(sch, 1, len);
1542 
1543 	flow->dropped++;
1544 	b->tin_dropped++;
1545 	sch->qstats.drops++;
1546 
1547 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1548 		cake_advance_shaper(q, b, skb, now, true);
1549 
1550 	__qdisc_drop(skb, to_free);
1551 	sch->q.qlen--;
1552 
1553 	cake_heapify(q, 0);
1554 
1555 	return idx + (tin << 16);
1556 }
1557 
1558 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1559 {
1560 	const int offset = skb_network_offset(skb);
1561 	u16 *buf, buf_;
1562 	u8 dscp;
1563 
1564 	switch (skb_protocol(skb, true)) {
1565 	case htons(ETH_P_IP):
1566 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1567 		if (unlikely(!buf))
1568 			return 0;
1569 
1570 		/* ToS is in the second byte of iphdr */
1571 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1572 
1573 		if (wash && dscp) {
1574 			const int wlen = offset + sizeof(struct iphdr);
1575 
1576 			if (!pskb_may_pull(skb, wlen) ||
1577 			    skb_try_make_writable(skb, wlen))
1578 				return 0;
1579 
1580 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1581 		}
1582 
1583 		return dscp;
1584 
1585 	case htons(ETH_P_IPV6):
1586 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1587 		if (unlikely(!buf))
1588 			return 0;
1589 
1590 		/* Traffic class is in the first and second bytes of ipv6hdr */
1591 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1592 
1593 		if (wash && dscp) {
1594 			const int wlen = offset + sizeof(struct ipv6hdr);
1595 
1596 			if (!pskb_may_pull(skb, wlen) ||
1597 			    skb_try_make_writable(skb, wlen))
1598 				return 0;
1599 
1600 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1601 		}
1602 
1603 		return dscp;
1604 
1605 	case htons(ETH_P_ARP):
1606 		return 0x38;  /* CS7 - Net Control */
1607 
1608 	default:
1609 		/* If there is no Diffserv field, treat as best-effort */
1610 		return 0;
1611 	}
1612 }
1613 
1614 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1615 					     struct sk_buff *skb)
1616 {
1617 	struct cake_sched_data *q = qdisc_priv(sch);
1618 	u32 tin, mark;
1619 	bool wash;
1620 	u8 dscp;
1621 
1622 	/* Tin selection: Default to diffserv-based selection, allow overriding
1623 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1624 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1625 	 */
1626 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1627 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1628 	if (wash)
1629 		dscp = cake_handle_diffserv(skb, wash);
1630 
1631 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1632 		tin = 0;
1633 
1634 	else if (mark && mark <= q->tin_cnt)
1635 		tin = q->tin_order[mark - 1];
1636 
1637 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1638 		 TC_H_MIN(skb->priority) > 0 &&
1639 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1640 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1641 
1642 	else {
1643 		if (!wash)
1644 			dscp = cake_handle_diffserv(skb, wash);
1645 		tin = q->tin_index[dscp];
1646 
1647 		if (unlikely(tin >= q->tin_cnt))
1648 			tin = 0;
1649 	}
1650 
1651 	return &q->tins[tin];
1652 }
1653 
1654 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1655 			 struct sk_buff *skb, int flow_mode, int *qerr)
1656 {
1657 	struct cake_sched_data *q = qdisc_priv(sch);
1658 	struct tcf_proto *filter;
1659 	struct tcf_result res;
1660 	u16 flow = 0, host = 0;
1661 	int result;
1662 
1663 	filter = rcu_dereference_bh(q->filter_list);
1664 	if (!filter)
1665 		goto hash;
1666 
1667 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1668 	result = tcf_classify(skb, NULL, filter, &res, false);
1669 
1670 	if (result >= 0) {
1671 #ifdef CONFIG_NET_CLS_ACT
1672 		switch (result) {
1673 		case TC_ACT_STOLEN:
1674 		case TC_ACT_QUEUED:
1675 		case TC_ACT_TRAP:
1676 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1677 			fallthrough;
1678 		case TC_ACT_SHOT:
1679 			return 0;
1680 		}
1681 #endif
1682 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1683 			flow = TC_H_MIN(res.classid);
1684 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1685 			host = TC_H_MAJ(res.classid) >> 16;
1686 	}
1687 hash:
1688 	*t = cake_select_tin(sch, skb);
1689 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1690 }
1691 
1692 static void cake_reconfigure(struct Qdisc *sch);
1693 
1694 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1695 			struct sk_buff **to_free)
1696 {
1697 	struct cake_sched_data *q = qdisc_priv(sch);
1698 	int len = qdisc_pkt_len(skb);
1699 	int ret;
1700 	struct sk_buff *ack = NULL;
1701 	ktime_t now = ktime_get();
1702 	struct cake_tin_data *b;
1703 	struct cake_flow *flow;
1704 	u32 idx;
1705 
1706 	/* choose flow to insert into */
1707 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1708 	if (idx == 0) {
1709 		if (ret & __NET_XMIT_BYPASS)
1710 			qdisc_qstats_drop(sch);
1711 		__qdisc_drop(skb, to_free);
1712 		return ret;
1713 	}
1714 	idx--;
1715 	flow = &b->flows[idx];
1716 
1717 	/* ensure shaper state isn't stale */
1718 	if (!b->tin_backlog) {
1719 		if (ktime_before(b->time_next_packet, now))
1720 			b->time_next_packet = now;
1721 
1722 		if (!sch->q.qlen) {
1723 			if (ktime_before(q->time_next_packet, now)) {
1724 				q->failsafe_next_packet = now;
1725 				q->time_next_packet = now;
1726 			} else if (ktime_after(q->time_next_packet, now) &&
1727 				   ktime_after(q->failsafe_next_packet, now)) {
1728 				u64 next = \
1729 					min(ktime_to_ns(q->time_next_packet),
1730 					    ktime_to_ns(
1731 						   q->failsafe_next_packet));
1732 				sch->qstats.overlimits++;
1733 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1734 			}
1735 		}
1736 	}
1737 
1738 	if (unlikely(len > b->max_skblen))
1739 		b->max_skblen = len;
1740 
1741 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1742 		struct sk_buff *segs, *nskb;
1743 		netdev_features_t features = netif_skb_features(skb);
1744 		unsigned int slen = 0, numsegs = 0;
1745 
1746 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1747 		if (IS_ERR_OR_NULL(segs))
1748 			return qdisc_drop(skb, sch, to_free);
1749 
1750 		skb_list_walk_safe(segs, segs, nskb) {
1751 			skb_mark_not_on_list(segs);
1752 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1753 			cobalt_set_enqueue_time(segs, now);
1754 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1755 									  segs);
1756 			flow_queue_add(flow, segs);
1757 
1758 			sch->q.qlen++;
1759 			numsegs++;
1760 			slen += segs->len;
1761 			q->buffer_used += segs->truesize;
1762 			b->packets++;
1763 		}
1764 
1765 		/* stats */
1766 		b->bytes	    += slen;
1767 		b->backlogs[idx]    += slen;
1768 		b->tin_backlog      += slen;
1769 		sch->qstats.backlog += slen;
1770 		q->avg_window_bytes += slen;
1771 
1772 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1773 		consume_skb(skb);
1774 	} else {
1775 		/* not splitting */
1776 		cobalt_set_enqueue_time(skb, now);
1777 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1778 		flow_queue_add(flow, skb);
1779 
1780 		if (q->ack_filter)
1781 			ack = cake_ack_filter(q, flow);
1782 
1783 		if (ack) {
1784 			b->ack_drops++;
1785 			sch->qstats.drops++;
1786 			b->bytes += qdisc_pkt_len(ack);
1787 			len -= qdisc_pkt_len(ack);
1788 			q->buffer_used += skb->truesize - ack->truesize;
1789 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1790 				cake_advance_shaper(q, b, ack, now, true);
1791 
1792 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1793 			consume_skb(ack);
1794 		} else {
1795 			sch->q.qlen++;
1796 			q->buffer_used      += skb->truesize;
1797 		}
1798 
1799 		/* stats */
1800 		b->packets++;
1801 		b->bytes	    += len;
1802 		b->backlogs[idx]    += len;
1803 		b->tin_backlog      += len;
1804 		sch->qstats.backlog += len;
1805 		q->avg_window_bytes += len;
1806 	}
1807 
1808 	if (q->overflow_timeout)
1809 		cake_heapify_up(q, b->overflow_idx[idx]);
1810 
1811 	/* incoming bandwidth capacity estimate */
1812 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1813 		u64 packet_interval = \
1814 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1815 
1816 		if (packet_interval > NSEC_PER_SEC)
1817 			packet_interval = NSEC_PER_SEC;
1818 
1819 		/* filter out short-term bursts, eg. wifi aggregation */
1820 		q->avg_packet_interval = \
1821 			cake_ewma(q->avg_packet_interval,
1822 				  packet_interval,
1823 				  (packet_interval > q->avg_packet_interval ?
1824 					  2 : 8));
1825 
1826 		q->last_packet_time = now;
1827 
1828 		if (packet_interval > q->avg_packet_interval) {
1829 			u64 window_interval = \
1830 				ktime_to_ns(ktime_sub(now,
1831 						      q->avg_window_begin));
1832 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1833 
1834 			b = div64_u64(b, window_interval);
1835 			q->avg_peak_bandwidth =
1836 				cake_ewma(q->avg_peak_bandwidth, b,
1837 					  b > q->avg_peak_bandwidth ? 2 : 8);
1838 			q->avg_window_bytes = 0;
1839 			q->avg_window_begin = now;
1840 
1841 			if (ktime_after(now,
1842 					ktime_add_ms(q->last_reconfig_time,
1843 						     250))) {
1844 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1845 				cake_reconfigure(sch);
1846 			}
1847 		}
1848 	} else {
1849 		q->avg_window_bytes = 0;
1850 		q->last_packet_time = now;
1851 	}
1852 
1853 	/* flowchain */
1854 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1855 		struct cake_host *srchost = &b->hosts[flow->srchost];
1856 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1857 		u16 host_load = 1;
1858 
1859 		if (!flow->set) {
1860 			list_add_tail(&flow->flowchain, &b->new_flows);
1861 		} else {
1862 			b->decaying_flow_count--;
1863 			list_move_tail(&flow->flowchain, &b->new_flows);
1864 		}
1865 		flow->set = CAKE_SET_SPARSE;
1866 		b->sparse_flow_count++;
1867 
1868 		if (cake_dsrc(q->flow_mode))
1869 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1870 
1871 		if (cake_ddst(q->flow_mode))
1872 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1873 
1874 		flow->deficit = (b->flow_quantum *
1875 				 quantum_div[host_load]) >> 16;
1876 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1877 		struct cake_host *srchost = &b->hosts[flow->srchost];
1878 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1879 
1880 		/* this flow was empty, accounted as a sparse flow, but actually
1881 		 * in the bulk rotation.
1882 		 */
1883 		flow->set = CAKE_SET_BULK;
1884 		b->sparse_flow_count--;
1885 		b->bulk_flow_count++;
1886 
1887 		if (cake_dsrc(q->flow_mode))
1888 			srchost->srchost_bulk_flow_count++;
1889 
1890 		if (cake_ddst(q->flow_mode))
1891 			dsthost->dsthost_bulk_flow_count++;
1892 
1893 	}
1894 
1895 	if (q->buffer_used > q->buffer_max_used)
1896 		q->buffer_max_used = q->buffer_used;
1897 
1898 	if (q->buffer_used > q->buffer_limit) {
1899 		u32 dropped = 0;
1900 
1901 		while (q->buffer_used > q->buffer_limit) {
1902 			dropped++;
1903 			cake_drop(sch, to_free);
1904 		}
1905 		b->drop_overlimit += dropped;
1906 	}
1907 	return NET_XMIT_SUCCESS;
1908 }
1909 
1910 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1911 {
1912 	struct cake_sched_data *q = qdisc_priv(sch);
1913 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1914 	struct cake_flow *flow = &b->flows[q->cur_flow];
1915 	struct sk_buff *skb = NULL;
1916 	u32 len;
1917 
1918 	if (flow->head) {
1919 		skb = dequeue_head(flow);
1920 		len = qdisc_pkt_len(skb);
1921 		b->backlogs[q->cur_flow] -= len;
1922 		b->tin_backlog		 -= len;
1923 		sch->qstats.backlog      -= len;
1924 		q->buffer_used		 -= skb->truesize;
1925 		sch->q.qlen--;
1926 
1927 		if (q->overflow_timeout)
1928 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1929 	}
1930 	return skb;
1931 }
1932 
1933 /* Discard leftover packets from a tin no longer in use. */
1934 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1935 {
1936 	struct cake_sched_data *q = qdisc_priv(sch);
1937 	struct sk_buff *skb;
1938 
1939 	q->cur_tin = tin;
1940 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1941 		while (!!(skb = cake_dequeue_one(sch)))
1942 			kfree_skb(skb);
1943 }
1944 
1945 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1946 {
1947 	struct cake_sched_data *q = qdisc_priv(sch);
1948 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1949 	struct cake_host *srchost, *dsthost;
1950 	ktime_t now = ktime_get();
1951 	struct cake_flow *flow;
1952 	struct list_head *head;
1953 	bool first_flow = true;
1954 	struct sk_buff *skb;
1955 	u16 host_load;
1956 	u64 delay;
1957 	u32 len;
1958 
1959 begin:
1960 	if (!sch->q.qlen)
1961 		return NULL;
1962 
1963 	/* global hard shaper */
1964 	if (ktime_after(q->time_next_packet, now) &&
1965 	    ktime_after(q->failsafe_next_packet, now)) {
1966 		u64 next = min(ktime_to_ns(q->time_next_packet),
1967 			       ktime_to_ns(q->failsafe_next_packet));
1968 
1969 		sch->qstats.overlimits++;
1970 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1971 		return NULL;
1972 	}
1973 
1974 	/* Choose a class to work on. */
1975 	if (!q->rate_ns) {
1976 		/* In unlimited mode, can't rely on shaper timings, just balance
1977 		 * with DRR
1978 		 */
1979 		bool wrapped = false, empty = true;
1980 
1981 		while (b->tin_deficit < 0 ||
1982 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1983 			if (b->tin_deficit <= 0)
1984 				b->tin_deficit += b->tin_quantum;
1985 			if (b->sparse_flow_count + b->bulk_flow_count)
1986 				empty = false;
1987 
1988 			q->cur_tin++;
1989 			b++;
1990 			if (q->cur_tin >= q->tin_cnt) {
1991 				q->cur_tin = 0;
1992 				b = q->tins;
1993 
1994 				if (wrapped) {
1995 					/* It's possible for q->qlen to be
1996 					 * nonzero when we actually have no
1997 					 * packets anywhere.
1998 					 */
1999 					if (empty)
2000 						return NULL;
2001 				} else {
2002 					wrapped = true;
2003 				}
2004 			}
2005 		}
2006 	} else {
2007 		/* In shaped mode, choose:
2008 		 * - Highest-priority tin with queue and meeting schedule, or
2009 		 * - The earliest-scheduled tin with queue.
2010 		 */
2011 		ktime_t best_time = KTIME_MAX;
2012 		int tin, best_tin = 0;
2013 
2014 		for (tin = 0; tin < q->tin_cnt; tin++) {
2015 			b = q->tins + tin;
2016 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2017 				ktime_t time_to_pkt = \
2018 					ktime_sub(b->time_next_packet, now);
2019 
2020 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2021 				    ktime_compare(time_to_pkt,
2022 						  best_time) <= 0) {
2023 					best_time = time_to_pkt;
2024 					best_tin = tin;
2025 				}
2026 			}
2027 		}
2028 
2029 		q->cur_tin = best_tin;
2030 		b = q->tins + best_tin;
2031 
2032 		/* No point in going further if no packets to deliver. */
2033 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2034 			return NULL;
2035 	}
2036 
2037 retry:
2038 	/* service this class */
2039 	head = &b->decaying_flows;
2040 	if (!first_flow || list_empty(head)) {
2041 		head = &b->new_flows;
2042 		if (list_empty(head)) {
2043 			head = &b->old_flows;
2044 			if (unlikely(list_empty(head))) {
2045 				head = &b->decaying_flows;
2046 				if (unlikely(list_empty(head)))
2047 					goto begin;
2048 			}
2049 		}
2050 	}
2051 	flow = list_first_entry(head, struct cake_flow, flowchain);
2052 	q->cur_flow = flow - b->flows;
2053 	first_flow = false;
2054 
2055 	/* triple isolation (modified DRR++) */
2056 	srchost = &b->hosts[flow->srchost];
2057 	dsthost = &b->hosts[flow->dsthost];
2058 	host_load = 1;
2059 
2060 	/* flow isolation (DRR++) */
2061 	if (flow->deficit <= 0) {
2062 		/* Keep all flows with deficits out of the sparse and decaying
2063 		 * rotations.  No non-empty flow can go into the decaying
2064 		 * rotation, so they can't get deficits
2065 		 */
2066 		if (flow->set == CAKE_SET_SPARSE) {
2067 			if (flow->head) {
2068 				b->sparse_flow_count--;
2069 				b->bulk_flow_count++;
2070 
2071 				if (cake_dsrc(q->flow_mode))
2072 					srchost->srchost_bulk_flow_count++;
2073 
2074 				if (cake_ddst(q->flow_mode))
2075 					dsthost->dsthost_bulk_flow_count++;
2076 
2077 				flow->set = CAKE_SET_BULK;
2078 			} else {
2079 				/* we've moved it to the bulk rotation for
2080 				 * correct deficit accounting but we still want
2081 				 * to count it as a sparse flow, not a bulk one.
2082 				 */
2083 				flow->set = CAKE_SET_SPARSE_WAIT;
2084 			}
2085 		}
2086 
2087 		if (cake_dsrc(q->flow_mode))
2088 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2089 
2090 		if (cake_ddst(q->flow_mode))
2091 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2092 
2093 		WARN_ON(host_load > CAKE_QUEUES);
2094 
2095 		/* The get_random_u16() is a way to apply dithering to avoid
2096 		 * accumulating roundoff errors
2097 		 */
2098 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2099 				  get_random_u16()) >> 16;
2100 		list_move_tail(&flow->flowchain, &b->old_flows);
2101 
2102 		goto retry;
2103 	}
2104 
2105 	/* Retrieve a packet via the AQM */
2106 	while (1) {
2107 		skb = cake_dequeue_one(sch);
2108 		if (!skb) {
2109 			/* this queue was actually empty */
2110 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2111 				b->unresponsive_flow_count--;
2112 
2113 			if (flow->cvars.p_drop || flow->cvars.count ||
2114 			    ktime_before(now, flow->cvars.drop_next)) {
2115 				/* keep in the flowchain until the state has
2116 				 * decayed to rest
2117 				 */
2118 				list_move_tail(&flow->flowchain,
2119 					       &b->decaying_flows);
2120 				if (flow->set == CAKE_SET_BULK) {
2121 					b->bulk_flow_count--;
2122 
2123 					if (cake_dsrc(q->flow_mode))
2124 						srchost->srchost_bulk_flow_count--;
2125 
2126 					if (cake_ddst(q->flow_mode))
2127 						dsthost->dsthost_bulk_flow_count--;
2128 
2129 					b->decaying_flow_count++;
2130 				} else if (flow->set == CAKE_SET_SPARSE ||
2131 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2132 					b->sparse_flow_count--;
2133 					b->decaying_flow_count++;
2134 				}
2135 				flow->set = CAKE_SET_DECAYING;
2136 			} else {
2137 				/* remove empty queue from the flowchain */
2138 				list_del_init(&flow->flowchain);
2139 				if (flow->set == CAKE_SET_SPARSE ||
2140 				    flow->set == CAKE_SET_SPARSE_WAIT)
2141 					b->sparse_flow_count--;
2142 				else if (flow->set == CAKE_SET_BULK) {
2143 					b->bulk_flow_count--;
2144 
2145 					if (cake_dsrc(q->flow_mode))
2146 						srchost->srchost_bulk_flow_count--;
2147 
2148 					if (cake_ddst(q->flow_mode))
2149 						dsthost->dsthost_bulk_flow_count--;
2150 
2151 				} else
2152 					b->decaying_flow_count--;
2153 
2154 				flow->set = CAKE_SET_NONE;
2155 			}
2156 			goto begin;
2157 		}
2158 
2159 		/* Last packet in queue may be marked, shouldn't be dropped */
2160 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2161 					(b->bulk_flow_count *
2162 					 !!(q->rate_flags &
2163 					    CAKE_FLAG_INGRESS))) ||
2164 		    !flow->head)
2165 			break;
2166 
2167 		/* drop this packet, get another one */
2168 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2169 			len = cake_advance_shaper(q, b, skb,
2170 						  now, true);
2171 			flow->deficit -= len;
2172 			b->tin_deficit -= len;
2173 		}
2174 		flow->dropped++;
2175 		b->tin_dropped++;
2176 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2177 		qdisc_qstats_drop(sch);
2178 		kfree_skb(skb);
2179 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2180 			goto retry;
2181 	}
2182 
2183 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2184 	qdisc_bstats_update(sch, skb);
2185 
2186 	/* collect delay stats */
2187 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2188 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2189 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2190 				  delay > b->peak_delay ? 2 : 8);
2191 	b->base_delay = cake_ewma(b->base_delay, delay,
2192 				  delay < b->base_delay ? 2 : 8);
2193 
2194 	len = cake_advance_shaper(q, b, skb, now, false);
2195 	flow->deficit -= len;
2196 	b->tin_deficit -= len;
2197 
2198 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2199 		u64 next = min(ktime_to_ns(q->time_next_packet),
2200 			       ktime_to_ns(q->failsafe_next_packet));
2201 
2202 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2203 	} else if (!sch->q.qlen) {
2204 		int i;
2205 
2206 		for (i = 0; i < q->tin_cnt; i++) {
2207 			if (q->tins[i].decaying_flow_count) {
2208 				ktime_t next = \
2209 					ktime_add_ns(now,
2210 						     q->tins[i].cparams.target);
2211 
2212 				qdisc_watchdog_schedule_ns(&q->watchdog,
2213 							   ktime_to_ns(next));
2214 				break;
2215 			}
2216 		}
2217 	}
2218 
2219 	if (q->overflow_timeout)
2220 		q->overflow_timeout--;
2221 
2222 	return skb;
2223 }
2224 
2225 static void cake_reset(struct Qdisc *sch)
2226 {
2227 	struct cake_sched_data *q = qdisc_priv(sch);
2228 	u32 c;
2229 
2230 	if (!q->tins)
2231 		return;
2232 
2233 	for (c = 0; c < CAKE_MAX_TINS; c++)
2234 		cake_clear_tin(sch, c);
2235 }
2236 
2237 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2238 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2239 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2240 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2241 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2242 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2243 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2244 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2245 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2246 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2247 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2248 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2249 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2250 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2251 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2252 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2253 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2254 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2255 };
2256 
2257 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2258 			  u64 target_ns, u64 rtt_est_ns)
2259 {
2260 	/* convert byte-rate into time-per-byte
2261 	 * so it will always unwedge in reasonable time.
2262 	 */
2263 	static const u64 MIN_RATE = 64;
2264 	u32 byte_target = mtu;
2265 	u64 byte_target_ns;
2266 	u8  rate_shft = 0;
2267 	u64 rate_ns = 0;
2268 
2269 	b->flow_quantum = 1514;
2270 	if (rate) {
2271 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2272 		rate_shft = 34;
2273 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2274 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2275 		while (!!(rate_ns >> 34)) {
2276 			rate_ns >>= 1;
2277 			rate_shft--;
2278 		}
2279 	} /* else unlimited, ie. zero delay */
2280 
2281 	b->tin_rate_bps  = rate;
2282 	b->tin_rate_ns   = rate_ns;
2283 	b->tin_rate_shft = rate_shft;
2284 
2285 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2286 
2287 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2288 	b->cparams.interval = max(rtt_est_ns +
2289 				     b->cparams.target - target_ns,
2290 				     b->cparams.target * 2);
2291 	b->cparams.mtu_time = byte_target_ns;
2292 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2293 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2294 }
2295 
2296 static int cake_config_besteffort(struct Qdisc *sch)
2297 {
2298 	struct cake_sched_data *q = qdisc_priv(sch);
2299 	struct cake_tin_data *b = &q->tins[0];
2300 	u32 mtu = psched_mtu(qdisc_dev(sch));
2301 	u64 rate = q->rate_bps;
2302 
2303 	q->tin_cnt = 1;
2304 
2305 	q->tin_index = besteffort;
2306 	q->tin_order = normal_order;
2307 
2308 	cake_set_rate(b, rate, mtu,
2309 		      us_to_ns(q->target), us_to_ns(q->interval));
2310 	b->tin_quantum = 65535;
2311 
2312 	return 0;
2313 }
2314 
2315 static int cake_config_precedence(struct Qdisc *sch)
2316 {
2317 	/* convert high-level (user visible) parameters into internal format */
2318 	struct cake_sched_data *q = qdisc_priv(sch);
2319 	u32 mtu = psched_mtu(qdisc_dev(sch));
2320 	u64 rate = q->rate_bps;
2321 	u32 quantum = 256;
2322 	u32 i;
2323 
2324 	q->tin_cnt = 8;
2325 	q->tin_index = precedence;
2326 	q->tin_order = normal_order;
2327 
2328 	for (i = 0; i < q->tin_cnt; i++) {
2329 		struct cake_tin_data *b = &q->tins[i];
2330 
2331 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2332 			      us_to_ns(q->interval));
2333 
2334 		b->tin_quantum = max_t(u16, 1U, quantum);
2335 
2336 		/* calculate next class's parameters */
2337 		rate  *= 7;
2338 		rate >>= 3;
2339 
2340 		quantum  *= 7;
2341 		quantum >>= 3;
2342 	}
2343 
2344 	return 0;
2345 }
2346 
2347 /*	List of known Diffserv codepoints:
2348  *
2349  *	Default Forwarding (DF/CS0) - Best Effort
2350  *	Max Throughput (TOS2)
2351  *	Min Delay (TOS4)
2352  *	LLT "La" (TOS5)
2353  *	Assured Forwarding 1 (AF1x) - x3
2354  *	Assured Forwarding 2 (AF2x) - x3
2355  *	Assured Forwarding 3 (AF3x) - x3
2356  *	Assured Forwarding 4 (AF4x) - x3
2357  *	Precedence Class 1 (CS1)
2358  *	Precedence Class 2 (CS2)
2359  *	Precedence Class 3 (CS3)
2360  *	Precedence Class 4 (CS4)
2361  *	Precedence Class 5 (CS5)
2362  *	Precedence Class 6 (CS6)
2363  *	Precedence Class 7 (CS7)
2364  *	Voice Admit (VA)
2365  *	Expedited Forwarding (EF)
2366  *	Lower Effort (LE)
2367  *
2368  *	Total 26 codepoints.
2369  */
2370 
2371 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2372  *		(roughly descending order of contended priority)
2373  *		(roughly ascending order of uncontended throughput)
2374  *
2375  *	Network Control (CS6,CS7)      - routing traffic
2376  *	Telephony (EF,VA)         - aka. VoIP streams
2377  *	Signalling (CS5)               - VoIP setup
2378  *	Multimedia Conferencing (AF4x) - aka. video calls
2379  *	Realtime Interactive (CS4)     - eg. games
2380  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2381  *	Broadcast Video (CS3)
2382  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2383  *	Ops, Admin, Management (CS2)      - eg. ssh
2384  *	Standard Service (DF & unrecognised codepoints)
2385  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2386  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2387  *
2388  *	Total 12 traffic classes.
2389  */
2390 
2391 static int cake_config_diffserv8(struct Qdisc *sch)
2392 {
2393 /*	Pruned list of traffic classes for typical applications:
2394  *
2395  *		Network Control          (CS6, CS7)
2396  *		Minimum Latency          (EF, VA, CS5, CS4)
2397  *		Interactive Shell        (CS2)
2398  *		Low Latency Transactions (AF2x, TOS4)
2399  *		Video Streaming          (AF4x, AF3x, CS3)
2400  *		Bog Standard             (DF etc.)
2401  *		High Throughput          (AF1x, TOS2, CS1)
2402  *		Background Traffic       (LE)
2403  *
2404  *		Total 8 traffic classes.
2405  */
2406 
2407 	struct cake_sched_data *q = qdisc_priv(sch);
2408 	u32 mtu = psched_mtu(qdisc_dev(sch));
2409 	u64 rate = q->rate_bps;
2410 	u32 quantum = 256;
2411 	u32 i;
2412 
2413 	q->tin_cnt = 8;
2414 
2415 	/* codepoint to class mapping */
2416 	q->tin_index = diffserv8;
2417 	q->tin_order = normal_order;
2418 
2419 	/* class characteristics */
2420 	for (i = 0; i < q->tin_cnt; i++) {
2421 		struct cake_tin_data *b = &q->tins[i];
2422 
2423 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2424 			      us_to_ns(q->interval));
2425 
2426 		b->tin_quantum = max_t(u16, 1U, quantum);
2427 
2428 		/* calculate next class's parameters */
2429 		rate  *= 7;
2430 		rate >>= 3;
2431 
2432 		quantum  *= 7;
2433 		quantum >>= 3;
2434 	}
2435 
2436 	return 0;
2437 }
2438 
2439 static int cake_config_diffserv4(struct Qdisc *sch)
2440 {
2441 /*  Further pruned list of traffic classes for four-class system:
2442  *
2443  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2444  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2445  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2446  *	    Background Traffic (LE, CS1)
2447  *
2448  *		Total 4 traffic classes.
2449  */
2450 
2451 	struct cake_sched_data *q = qdisc_priv(sch);
2452 	u32 mtu = psched_mtu(qdisc_dev(sch));
2453 	u64 rate = q->rate_bps;
2454 	u32 quantum = 1024;
2455 
2456 	q->tin_cnt = 4;
2457 
2458 	/* codepoint to class mapping */
2459 	q->tin_index = diffserv4;
2460 	q->tin_order = bulk_order;
2461 
2462 	/* class characteristics */
2463 	cake_set_rate(&q->tins[0], rate, mtu,
2464 		      us_to_ns(q->target), us_to_ns(q->interval));
2465 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2466 		      us_to_ns(q->target), us_to_ns(q->interval));
2467 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2468 		      us_to_ns(q->target), us_to_ns(q->interval));
2469 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2470 		      us_to_ns(q->target), us_to_ns(q->interval));
2471 
2472 	/* bandwidth-sharing weights */
2473 	q->tins[0].tin_quantum = quantum;
2474 	q->tins[1].tin_quantum = quantum >> 4;
2475 	q->tins[2].tin_quantum = quantum >> 1;
2476 	q->tins[3].tin_quantum = quantum >> 2;
2477 
2478 	return 0;
2479 }
2480 
2481 static int cake_config_diffserv3(struct Qdisc *sch)
2482 {
2483 /*  Simplified Diffserv structure with 3 tins.
2484  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2485  *		Best Effort
2486  *		Low Priority		(LE, CS1)
2487  */
2488 	struct cake_sched_data *q = qdisc_priv(sch);
2489 	u32 mtu = psched_mtu(qdisc_dev(sch));
2490 	u64 rate = q->rate_bps;
2491 	u32 quantum = 1024;
2492 
2493 	q->tin_cnt = 3;
2494 
2495 	/* codepoint to class mapping */
2496 	q->tin_index = diffserv3;
2497 	q->tin_order = bulk_order;
2498 
2499 	/* class characteristics */
2500 	cake_set_rate(&q->tins[0], rate, mtu,
2501 		      us_to_ns(q->target), us_to_ns(q->interval));
2502 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2503 		      us_to_ns(q->target), us_to_ns(q->interval));
2504 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2505 		      us_to_ns(q->target), us_to_ns(q->interval));
2506 
2507 	/* bandwidth-sharing weights */
2508 	q->tins[0].tin_quantum = quantum;
2509 	q->tins[1].tin_quantum = quantum >> 4;
2510 	q->tins[2].tin_quantum = quantum >> 2;
2511 
2512 	return 0;
2513 }
2514 
2515 static void cake_reconfigure(struct Qdisc *sch)
2516 {
2517 	struct cake_sched_data *q = qdisc_priv(sch);
2518 	int c, ft;
2519 
2520 	switch (q->tin_mode) {
2521 	case CAKE_DIFFSERV_BESTEFFORT:
2522 		ft = cake_config_besteffort(sch);
2523 		break;
2524 
2525 	case CAKE_DIFFSERV_PRECEDENCE:
2526 		ft = cake_config_precedence(sch);
2527 		break;
2528 
2529 	case CAKE_DIFFSERV_DIFFSERV8:
2530 		ft = cake_config_diffserv8(sch);
2531 		break;
2532 
2533 	case CAKE_DIFFSERV_DIFFSERV4:
2534 		ft = cake_config_diffserv4(sch);
2535 		break;
2536 
2537 	case CAKE_DIFFSERV_DIFFSERV3:
2538 	default:
2539 		ft = cake_config_diffserv3(sch);
2540 		break;
2541 	}
2542 
2543 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2544 		cake_clear_tin(sch, c);
2545 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2546 	}
2547 
2548 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2549 	q->rate_shft = q->tins[ft].tin_rate_shft;
2550 
2551 	if (q->buffer_config_limit) {
2552 		q->buffer_limit = q->buffer_config_limit;
2553 	} else if (q->rate_bps) {
2554 		u64 t = q->rate_bps * q->interval;
2555 
2556 		do_div(t, USEC_PER_SEC / 4);
2557 		q->buffer_limit = max_t(u32, t, 4U << 20);
2558 	} else {
2559 		q->buffer_limit = ~0;
2560 	}
2561 
2562 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2563 
2564 	q->buffer_limit = min(q->buffer_limit,
2565 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2566 				  q->buffer_config_limit));
2567 }
2568 
2569 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2570 		       struct netlink_ext_ack *extack)
2571 {
2572 	struct cake_sched_data *q = qdisc_priv(sch);
2573 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2574 	int err;
2575 
2576 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2577 					  extack);
2578 	if (err < 0)
2579 		return err;
2580 
2581 	if (tb[TCA_CAKE_NAT]) {
2582 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2583 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2584 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2585 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2586 #else
2587 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2588 				    "No conntrack support in kernel");
2589 		return -EOPNOTSUPP;
2590 #endif
2591 	}
2592 
2593 	if (tb[TCA_CAKE_BASE_RATE64])
2594 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2595 
2596 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2597 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2598 
2599 	if (tb[TCA_CAKE_WASH]) {
2600 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2601 			q->rate_flags |= CAKE_FLAG_WASH;
2602 		else
2603 			q->rate_flags &= ~CAKE_FLAG_WASH;
2604 	}
2605 
2606 	if (tb[TCA_CAKE_FLOW_MODE])
2607 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2608 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2609 					CAKE_FLOW_MASK));
2610 
2611 	if (tb[TCA_CAKE_ATM])
2612 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2613 
2614 	if (tb[TCA_CAKE_OVERHEAD]) {
2615 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2616 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2617 
2618 		q->max_netlen = 0;
2619 		q->max_adjlen = 0;
2620 		q->min_netlen = ~0;
2621 		q->min_adjlen = ~0;
2622 	}
2623 
2624 	if (tb[TCA_CAKE_RAW]) {
2625 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2626 
2627 		q->max_netlen = 0;
2628 		q->max_adjlen = 0;
2629 		q->min_netlen = ~0;
2630 		q->min_adjlen = ~0;
2631 	}
2632 
2633 	if (tb[TCA_CAKE_MPU])
2634 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2635 
2636 	if (tb[TCA_CAKE_RTT]) {
2637 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2638 
2639 		if (!q->interval)
2640 			q->interval = 1;
2641 	}
2642 
2643 	if (tb[TCA_CAKE_TARGET]) {
2644 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2645 
2646 		if (!q->target)
2647 			q->target = 1;
2648 	}
2649 
2650 	if (tb[TCA_CAKE_AUTORATE]) {
2651 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2652 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2653 		else
2654 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2655 	}
2656 
2657 	if (tb[TCA_CAKE_INGRESS]) {
2658 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2659 			q->rate_flags |= CAKE_FLAG_INGRESS;
2660 		else
2661 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2662 	}
2663 
2664 	if (tb[TCA_CAKE_ACK_FILTER])
2665 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2666 
2667 	if (tb[TCA_CAKE_MEMORY])
2668 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2669 
2670 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2671 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2672 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2673 		else
2674 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2675 	}
2676 
2677 	if (tb[TCA_CAKE_FWMARK]) {
2678 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2679 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2680 	}
2681 
2682 	if (q->tins) {
2683 		sch_tree_lock(sch);
2684 		cake_reconfigure(sch);
2685 		sch_tree_unlock(sch);
2686 	}
2687 
2688 	return 0;
2689 }
2690 
2691 static void cake_destroy(struct Qdisc *sch)
2692 {
2693 	struct cake_sched_data *q = qdisc_priv(sch);
2694 
2695 	qdisc_watchdog_cancel(&q->watchdog);
2696 	tcf_block_put(q->block);
2697 	kvfree(q->tins);
2698 }
2699 
2700 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2701 		     struct netlink_ext_ack *extack)
2702 {
2703 	struct cake_sched_data *q = qdisc_priv(sch);
2704 	int i, j, err;
2705 
2706 	sch->limit = 10240;
2707 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2708 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2709 
2710 	q->rate_bps = 0; /* unlimited by default */
2711 
2712 	q->interval = 100000; /* 100ms default */
2713 	q->target   =   5000; /* 5ms: codel RFC argues
2714 			       * for 5 to 10% of interval
2715 			       */
2716 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2717 	q->cur_tin = 0;
2718 	q->cur_flow  = 0;
2719 
2720 	qdisc_watchdog_init(&q->watchdog, sch);
2721 
2722 	if (opt) {
2723 		err = cake_change(sch, opt, extack);
2724 
2725 		if (err)
2726 			return err;
2727 	}
2728 
2729 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2730 	if (err)
2731 		return err;
2732 
2733 	quantum_div[0] = ~0;
2734 	for (i = 1; i <= CAKE_QUEUES; i++)
2735 		quantum_div[i] = 65535 / i;
2736 
2737 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2738 			   GFP_KERNEL);
2739 	if (!q->tins)
2740 		return -ENOMEM;
2741 
2742 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2743 		struct cake_tin_data *b = q->tins + i;
2744 
2745 		INIT_LIST_HEAD(&b->new_flows);
2746 		INIT_LIST_HEAD(&b->old_flows);
2747 		INIT_LIST_HEAD(&b->decaying_flows);
2748 		b->sparse_flow_count = 0;
2749 		b->bulk_flow_count = 0;
2750 		b->decaying_flow_count = 0;
2751 
2752 		for (j = 0; j < CAKE_QUEUES; j++) {
2753 			struct cake_flow *flow = b->flows + j;
2754 			u32 k = j * CAKE_MAX_TINS + i;
2755 
2756 			INIT_LIST_HEAD(&flow->flowchain);
2757 			cobalt_vars_init(&flow->cvars);
2758 
2759 			q->overflow_heap[k].t = i;
2760 			q->overflow_heap[k].b = j;
2761 			b->overflow_idx[j] = k;
2762 		}
2763 	}
2764 
2765 	cake_reconfigure(sch);
2766 	q->avg_peak_bandwidth = q->rate_bps;
2767 	q->min_netlen = ~0;
2768 	q->min_adjlen = ~0;
2769 	return 0;
2770 }
2771 
2772 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2773 {
2774 	struct cake_sched_data *q = qdisc_priv(sch);
2775 	struct nlattr *opts;
2776 
2777 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2778 	if (!opts)
2779 		goto nla_put_failure;
2780 
2781 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2782 			      TCA_CAKE_PAD))
2783 		goto nla_put_failure;
2784 
2785 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2786 			q->flow_mode & CAKE_FLOW_MASK))
2787 		goto nla_put_failure;
2788 
2789 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2790 		goto nla_put_failure;
2791 
2792 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2793 		goto nla_put_failure;
2794 
2795 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2796 		goto nla_put_failure;
2797 
2798 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2799 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2800 		goto nla_put_failure;
2801 
2802 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2803 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2804 		goto nla_put_failure;
2805 
2806 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2807 		goto nla_put_failure;
2808 
2809 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2810 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2811 		goto nla_put_failure;
2812 
2813 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2814 		goto nla_put_failure;
2815 
2816 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2817 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2818 		goto nla_put_failure;
2819 
2820 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2821 		goto nla_put_failure;
2822 
2823 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2824 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2825 			goto nla_put_failure;
2826 
2827 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2828 		goto nla_put_failure;
2829 
2830 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2831 		goto nla_put_failure;
2832 
2833 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2834 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2835 		goto nla_put_failure;
2836 
2837 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2838 		goto nla_put_failure;
2839 
2840 	return nla_nest_end(skb, opts);
2841 
2842 nla_put_failure:
2843 	return -1;
2844 }
2845 
2846 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2847 {
2848 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2849 	struct cake_sched_data *q = qdisc_priv(sch);
2850 	struct nlattr *tstats, *ts;
2851 	int i;
2852 
2853 	if (!stats)
2854 		return -1;
2855 
2856 #define PUT_STAT_U32(attr, data) do {				       \
2857 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2858 			goto nla_put_failure;			       \
2859 	} while (0)
2860 #define PUT_STAT_U64(attr, data) do {				       \
2861 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2862 					data, TCA_CAKE_STATS_PAD)) \
2863 			goto nla_put_failure;			       \
2864 	} while (0)
2865 
2866 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2867 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2868 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2869 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2870 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2871 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2872 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2873 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2874 
2875 #undef PUT_STAT_U32
2876 #undef PUT_STAT_U64
2877 
2878 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2879 	if (!tstats)
2880 		goto nla_put_failure;
2881 
2882 #define PUT_TSTAT_U32(attr, data) do {					\
2883 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2884 			goto nla_put_failure;				\
2885 	} while (0)
2886 #define PUT_TSTAT_U64(attr, data) do {					\
2887 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2888 					data, TCA_CAKE_TIN_STATS_PAD))	\
2889 			goto nla_put_failure;				\
2890 	} while (0)
2891 
2892 	for (i = 0; i < q->tin_cnt; i++) {
2893 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2894 
2895 		ts = nla_nest_start_noflag(d->skb, i + 1);
2896 		if (!ts)
2897 			goto nla_put_failure;
2898 
2899 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2900 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2901 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2902 
2903 		PUT_TSTAT_U32(TARGET_US,
2904 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2905 		PUT_TSTAT_U32(INTERVAL_US,
2906 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2907 
2908 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2909 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2910 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2911 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2912 
2913 		PUT_TSTAT_U32(PEAK_DELAY_US,
2914 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2915 		PUT_TSTAT_U32(AVG_DELAY_US,
2916 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2917 		PUT_TSTAT_U32(BASE_DELAY_US,
2918 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2919 
2920 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2921 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2922 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2923 
2924 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2925 					    b->decaying_flow_count);
2926 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2927 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2928 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2929 
2930 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2931 		nla_nest_end(d->skb, ts);
2932 	}
2933 
2934 #undef PUT_TSTAT_U32
2935 #undef PUT_TSTAT_U64
2936 
2937 	nla_nest_end(d->skb, tstats);
2938 	return nla_nest_end(d->skb, stats);
2939 
2940 nla_put_failure:
2941 	nla_nest_cancel(d->skb, stats);
2942 	return -1;
2943 }
2944 
2945 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2946 {
2947 	return NULL;
2948 }
2949 
2950 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2951 {
2952 	return 0;
2953 }
2954 
2955 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2956 			       u32 classid)
2957 {
2958 	return 0;
2959 }
2960 
2961 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2962 {
2963 }
2964 
2965 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2966 					struct netlink_ext_ack *extack)
2967 {
2968 	struct cake_sched_data *q = qdisc_priv(sch);
2969 
2970 	if (cl)
2971 		return NULL;
2972 	return q->block;
2973 }
2974 
2975 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2976 			   struct sk_buff *skb, struct tcmsg *tcm)
2977 {
2978 	tcm->tcm_handle |= TC_H_MIN(cl);
2979 	return 0;
2980 }
2981 
2982 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2983 				 struct gnet_dump *d)
2984 {
2985 	struct cake_sched_data *q = qdisc_priv(sch);
2986 	const struct cake_flow *flow = NULL;
2987 	struct gnet_stats_queue qs = { 0 };
2988 	struct nlattr *stats;
2989 	u32 idx = cl - 1;
2990 
2991 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2992 		const struct cake_tin_data *b = \
2993 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2994 		const struct sk_buff *skb;
2995 
2996 		flow = &b->flows[idx % CAKE_QUEUES];
2997 
2998 		if (flow->head) {
2999 			sch_tree_lock(sch);
3000 			skb = flow->head;
3001 			while (skb) {
3002 				qs.qlen++;
3003 				skb = skb->next;
3004 			}
3005 			sch_tree_unlock(sch);
3006 		}
3007 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3008 		qs.drops = flow->dropped;
3009 	}
3010 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3011 		return -1;
3012 	if (flow) {
3013 		ktime_t now = ktime_get();
3014 
3015 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3016 		if (!stats)
3017 			return -1;
3018 
3019 #define PUT_STAT_U32(attr, data) do {				       \
3020 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3021 			goto nla_put_failure;			       \
3022 	} while (0)
3023 #define PUT_STAT_S32(attr, data) do {				       \
3024 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3025 			goto nla_put_failure;			       \
3026 	} while (0)
3027 
3028 		PUT_STAT_S32(DEFICIT, flow->deficit);
3029 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3030 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3031 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3032 		if (flow->cvars.p_drop) {
3033 			PUT_STAT_S32(BLUE_TIMER_US,
3034 				     ktime_to_us(
3035 					     ktime_sub(now,
3036 						       flow->cvars.blue_timer)));
3037 		}
3038 		if (flow->cvars.dropping) {
3039 			PUT_STAT_S32(DROP_NEXT_US,
3040 				     ktime_to_us(
3041 					     ktime_sub(now,
3042 						       flow->cvars.drop_next)));
3043 		}
3044 
3045 		if (nla_nest_end(d->skb, stats) < 0)
3046 			return -1;
3047 	}
3048 
3049 	return 0;
3050 
3051 nla_put_failure:
3052 	nla_nest_cancel(d->skb, stats);
3053 	return -1;
3054 }
3055 
3056 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3057 {
3058 	struct cake_sched_data *q = qdisc_priv(sch);
3059 	unsigned int i, j;
3060 
3061 	if (arg->stop)
3062 		return;
3063 
3064 	for (i = 0; i < q->tin_cnt; i++) {
3065 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3066 
3067 		for (j = 0; j < CAKE_QUEUES; j++) {
3068 			if (list_empty(&b->flows[j].flowchain)) {
3069 				arg->count++;
3070 				continue;
3071 			}
3072 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3073 						 arg))
3074 				break;
3075 		}
3076 	}
3077 }
3078 
3079 static const struct Qdisc_class_ops cake_class_ops = {
3080 	.leaf		=	cake_leaf,
3081 	.find		=	cake_find,
3082 	.tcf_block	=	cake_tcf_block,
3083 	.bind_tcf	=	cake_bind,
3084 	.unbind_tcf	=	cake_unbind,
3085 	.dump		=	cake_dump_class,
3086 	.dump_stats	=	cake_dump_class_stats,
3087 	.walk		=	cake_walk,
3088 };
3089 
3090 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3091 	.cl_ops		=	&cake_class_ops,
3092 	.id		=	"cake",
3093 	.priv_size	=	sizeof(struct cake_sched_data),
3094 	.enqueue	=	cake_enqueue,
3095 	.dequeue	=	cake_dequeue,
3096 	.peek		=	qdisc_peek_dequeued,
3097 	.init		=	cake_init,
3098 	.reset		=	cake_reset,
3099 	.destroy	=	cake_destroy,
3100 	.change		=	cake_change,
3101 	.dump		=	cake_dump,
3102 	.dump_stats	=	cake_dump_stats,
3103 	.owner		=	THIS_MODULE,
3104 };
3105 
3106 static int __init cake_module_init(void)
3107 {
3108 	return register_qdisc(&cake_qdisc_ops);
3109 }
3110 
3111 static void __exit cake_module_exit(void)
3112 {
3113 	unregister_qdisc(&cake_qdisc_ops);
3114 }
3115 
3116 module_init(cake_module_init)
3117 module_exit(cake_module_exit)
3118 MODULE_AUTHOR("Jonathan Morton");
3119 MODULE_LICENSE("Dual BSD/GPL");
3120 MODULE_DESCRIPTION("The CAKE shaper.");
3121