xref: /openbmc/linux/net/sched/sch_cake.c (revision 95dbf14b)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (e.g. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
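/* Illustrative usage (not part of the code, figures assumed): a typical
 * deployment shapes a link just below its sync rate and enables per-host
 * fairness behind NAT via the iproute2 front-end, e.g.:
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 19500kbit diffserv4 nat
 *
 * The keywords map onto the rate, tin_mode and flow_mode fields of
 * struct cake_sched_data configured below.
 */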
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop spacing (sets the signalling rate)
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_bulk_flow_count;
142 	u16 dsthost_bulk_flow_count;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum;
177 	s32	tin_deficit;
178 	u32	tin_backlog;
179 	u32	tin_dropped;
180 	u32	tin_ecn_mark;
181 
182 	u32	packets;
183 	u64	bytes;
184 
185 	u32	ack_drops;
186 
187 	/* moving averages */
188 	u64 avge_delay;
189 	u64 peak_delay;
190 	u64 base_delay;
191 
192 	/* hash function stats */
193 	u32	way_directs;
194 	u32	way_hits;
195 	u32	way_misses;
196 	u32	way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198 
199 struct cake_sched_data {
200 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
201 	struct tcf_block *block;
202 	struct cake_tin_data *tins;
203 
204 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205 	u16		overflow_timeout;
206 
207 	u16		tin_cnt;
208 	u8		tin_mode;
209 	u8		flow_mode;
210 	u8		ack_filter;
211 	u8		atm_mode;
212 
213 	u32		fwmark_mask;
214 	u16		fwmark_shft;
215 
216 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217 	u16		rate_shft;
218 	ktime_t		time_next_packet;
219 	ktime_t		failsafe_next_packet;
220 	u64		rate_ns;
221 	u64		rate_bps;
222 	u16		rate_flags;
223 	s16		rate_overhead;
224 	u16		rate_mpu;
225 	u64		interval;
226 	u64		target;
227 
228 	/* resource tracking */
229 	u32		buffer_used;
230 	u32		buffer_max_used;
231 	u32		buffer_limit;
232 	u32		buffer_config_limit;
233 
234 	/* indices for dequeue */
235 	u16		cur_tin;
236 	u16		cur_flow;
237 
238 	struct qdisc_watchdog watchdog;
239 	const u8	*tin_index;
240 	const u8	*tin_order;
241 
242 	/* bandwidth capacity estimate */
243 	ktime_t		last_packet_time;
244 	ktime_t		avg_window_begin;
245 	u64		avg_packet_interval;
246 	u64		avg_window_bytes;
247 	u64		avg_peak_bandwidth;
248 	ktime_t		last_reconfig_time;
249 
250 	/* packet length stats */
251 	u32		avg_netoff;
252 	u16		max_netlen;
253 	u16		max_adjlen;
254 	u16		min_netlen;
255 	u16		min_adjlen;
256 };
257 
258 enum {
259 	CAKE_FLAG_OVERHEAD	   = BIT(0),
260 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261 	CAKE_FLAG_INGRESS	   = BIT(2),
262 	CAKE_FLAG_WASH		   = BIT(3),
263 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
264 };
265 
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271 
272 struct cobalt_skb_cb {
273 	ktime_t enqueue_time;
274 	u32     adjusted_len;
275 };
276 
277 static u64 us_to_ns(u64 us)
278 {
279 	return us * NSEC_PER_USEC;
280 }
281 
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287 
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290 	return get_cobalt_cb(skb)->enqueue_time;
291 }
292 
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294 				    ktime_t now)
295 {
296 	get_cobalt_cb(skb)->enqueue_time = now;
297 }
298 
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300 
301 /* Diffserv lookup tables */
302 
303 static const u8 precedence[] = {
304 	0, 0, 0, 0, 0, 0, 0, 0,
305 	1, 1, 1, 1, 1, 1, 1, 1,
306 	2, 2, 2, 2, 2, 2, 2, 2,
307 	3, 3, 3, 3, 3, 3, 3, 3,
308 	4, 4, 4, 4, 4, 4, 4, 4,
309 	5, 5, 5, 5, 5, 5, 5, 5,
310 	6, 6, 6, 6, 6, 6, 6, 6,
311 	7, 7, 7, 7, 7, 7, 7, 7,
312 };
313 
314 static const u8 diffserv8[] = {
315 	2, 5, 1, 2, 4, 2, 2, 2,
316 	0, 2, 1, 2, 1, 2, 1, 2,
317 	5, 2, 4, 2, 4, 2, 4, 2,
318 	3, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 2, 2, 6, 2, 6, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 };
324 
325 static const u8 diffserv4[] = {
326 	0, 2, 0, 0, 2, 0, 0, 0,
327 	1, 0, 0, 0, 0, 0, 0, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 0, 0, 3, 0, 3, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 };
335 
336 static const u8 diffserv3[] = {
337 	0, 0, 0, 0, 2, 0, 0, 0,
338 	1, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 2, 0, 2, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 };
346 
347 static const u8 besteffort[] = {
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 };
357 
358 /* tin priority order for stats dumping */
359 
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362 
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365 
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
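/* Worked example (illustrative): for count = 4 the true value is
 * 1/sqrt(4) = 0.5, i.e. 0x80000000 in Q0.32.  One Newton step from that
 * point reproduces it exactly:
 *
 *   invsqrt^2             = 0.25
 *   3 - count * invsqrt^2 = 3 - 1 = 2
 *   (invsqrt / 2) * 2     = 0.5
 *
 * Starting from a nearby estimate instead, each step roughly doubles the
 * number of correct bits, which is why the four iterations per entry in
 * cobalt_cache_init() below are sufficient.
 */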
371 
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374 	u32 invsqrt, invsqrt2;
375 	u64 val;
376 
377 	invsqrt = vars->rec_inv_sqrt;
378 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380 
381 	val >>= 2; /* avoid overflow in following multiply */
382 	val = (val * invsqrt) >> (32 - 2 + 1);
383 
384 	vars->rec_inv_sqrt = val;
385 }
386 
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389 	if (vars->count < REC_INV_SQRT_CACHE)
390 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391 	else
392 		cobalt_newton_step(vars);
393 }
394 
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is large enough to give
402  * the value that *should* have been produced at count 4.
403  */
404 
405 static void cobalt_cache_init(void)
406 {
407 	struct cobalt_vars v;
408 
409 	memset(&v, 0, sizeof(v));
410 	v.rec_inv_sqrt = ~0U;
411 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412 
413 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 
419 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420 	}
421 }
422 
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425 	memset(vars, 0, sizeof(*vars));
426 
427 	if (!cobalt_rec_inv_sqrt_cache[0]) {
428 		cobalt_cache_init();
429 		cobalt_rec_inv_sqrt_cache[0] = ~0;
430 	}
431 }
432 
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
437 static ktime_t cobalt_control(ktime_t t,
438 			      u64 interval,
439 			      u32 rec_inv_sqrt)
440 {
441 	return ktime_add_ns(t, reciprocal_scale(interval,
442 						rec_inv_sqrt));
443 }
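/* Worked example (illustrative, values assumed): with interval = 100 ms and
 * count = 4, rec_inv_sqrt ~= 0x80000000 (0.5 in Q0.32), so
 * reciprocal_scale(interval, rec_inv_sqrt) = 100 ms * 0.5 = 50 ms; the next
 * signalling event lands interval/sqrt(count) after 't', as intended.
 */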
444 
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449 			      struct cobalt_params *p,
450 			      ktime_t now)
451 {
452 	bool up = false;
453 
454 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455 		up = !vars->p_drop;
456 		vars->p_drop += p->p_inc;
457 		if (vars->p_drop < p->p_inc)
458 			vars->p_drop = ~0;
459 		vars->blue_timer = now;
460 	}
461 	vars->dropping = true;
462 	vars->drop_next = now;
463 	if (!vars->count)
464 		vars->count = 1;
465 
466 	return up;
467 }
468 
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473 			       struct cobalt_params *p,
474 			       ktime_t now)
475 {
476 	bool down = false;
477 
478 	if (vars->p_drop &&
479 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480 		if (vars->p_drop < p->p_dec)
481 			vars->p_drop = 0;
482 		else
483 			vars->p_drop -= p->p_dec;
484 		vars->blue_timer = now;
485 		down = !vars->p_drop;
486 	}
487 	vars->dropping = false;
488 
489 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490 		vars->count--;
491 		cobalt_invsqrt(vars);
492 		vars->drop_next = cobalt_control(vars->drop_next,
493 						 p->interval,
494 						 vars->rec_inv_sqrt);
495 	}
496 
497 	return down;
498 }
499 
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504 			       struct cobalt_params *p,
505 			       ktime_t now,
506 			       struct sk_buff *skb,
507 			       u32 bulk_flows)
508 {
509 	bool next_due, over_target, drop = false;
510 	ktime_t schedule;
511 	u64 sojourn;
512 
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, which eliminates a
523  * significant amount of the reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
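/* Illustrative numbers (assumed, not taken from the code): on a 10 Mbit/s tin
 * a 1514-byte packet takes ~1.2 ms to serialise (mtu_time).  With a 5 ms
 * target and bulk_flows = 4, the 'over_target' test below only fires once the
 * sojourn time exceeds max(5 ms, 2 * 4 * 1.2 ms, 4 * 1.2 ms) ~= 9.7 ms, so
 * the AQM automatically backs off when many bulk flows share a slow tin.
 */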
527 
528 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529 	schedule = ktime_sub(now, vars->drop_next);
530 	over_target = sojourn > p->target &&
531 		      sojourn > p->mtu_time * bulk_flows * 2 &&
532 		      sojourn > p->mtu_time * 4;
533 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
534 
535 	vars->ecn_marked = false;
536 
537 	if (over_target) {
538 		if (!vars->dropping) {
539 			vars->dropping = true;
540 			vars->drop_next = cobalt_control(now,
541 							 p->interval,
542 							 vars->rec_inv_sqrt);
543 		}
544 		if (!vars->count)
545 			vars->count = 1;
546 	} else if (vars->dropping) {
547 		vars->dropping = false;
548 	}
549 
550 	if (next_due && vars->dropping) {
551 		/* Use ECN mark if possible, otherwise drop */
552 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553 
554 		vars->count++;
555 		if (!vars->count)
556 			vars->count--;
557 		cobalt_invsqrt(vars);
558 		vars->drop_next = cobalt_control(vars->drop_next,
559 						 p->interval,
560 						 vars->rec_inv_sqrt);
561 		schedule = ktime_sub(now, vars->drop_next);
562 	} else {
563 		while (next_due) {
564 			vars->count--;
565 			cobalt_invsqrt(vars);
566 			vars->drop_next = cobalt_control(vars->drop_next,
567 							 p->interval,
568 							 vars->rec_inv_sqrt);
569 			schedule = ktime_sub(now, vars->drop_next);
570 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
571 		}
572 	}
573 
574 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
575 	if (vars->p_drop)
576 		drop |= (prandom_u32() < vars->p_drop);
577 
578 	/* Overload the drop_next field as an activity timeout */
579 	if (!vars->count)
580 		vars->drop_next = ktime_add_ns(now, p->interval);
581 	else if (ktime_to_ns(schedule) > 0 && !drop)
582 		vars->drop_next = now;
583 
584 	return drop;
585 }
586 
587 static void cake_update_flowkeys(struct flow_keys *keys,
588 				 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 	struct nf_conntrack_tuple tuple = {};
592 	bool rev = !skb->_nfct;
593 
594 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
595 		return;
596 
597 	if (!nf_ct_get_tuple_skb(&tuple, skb))
598 		return;
599 
600 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
601 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
602 
603 	if (keys->ports.ports) {
604 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
605 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
606 	}
607 #endif
608 }
609 
610 /* CAKE has several flow modes that set multiple bits at once.  The checks
611  * below therefore also match triple-isolate mode.
612  */
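/* For example (values quoted from the UAPI header for illustration):
 * CAKE_FLOW_DUAL_SRC = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS and
 * CAKE_FLOW_TRIPLE   = CAKE_FLOW_HOSTS  | CAKE_FLOW_FLOWS, so a flow_mode of
 * CAKE_FLOW_TRIPLE satisfies both cake_dsrc() and cake_ddst() below.
 */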
613 
614 static bool cake_dsrc(int flow_mode)
615 {
616 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
617 }
618 
619 static bool cake_ddst(int flow_mode)
620 {
621 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
622 }
623 
624 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
625 		     int flow_mode, u16 flow_override, u16 host_override)
626 {
627 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
628 	u16 reduced_hash, srchost_idx, dsthost_idx;
629 	struct flow_keys keys, host_keys;
630 
631 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
632 		return 0;
633 
634 	/* If both overrides are set we can skip packet dissection entirely */
635 	if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
636 	    (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
637 		goto skip_hash;
638 
639 	skb_flow_dissect_flow_keys(skb, &keys,
640 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
641 
642 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
643 		cake_update_flowkeys(&keys, skb);
644 
645 	/* flow_hash_from_keys() sorts the addresses by value, so we have
646 	 * to preserve their order in a separate data structure to treat
647 	 * src and dst host addresses as independently selectable.
648 	 */
649 	host_keys = keys;
650 	host_keys.ports.ports     = 0;
651 	host_keys.basic.ip_proto  = 0;
652 	host_keys.keyid.keyid     = 0;
653 	host_keys.tags.flow_label = 0;
654 
655 	switch (host_keys.control.addr_type) {
656 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
657 		host_keys.addrs.v4addrs.src = 0;
658 		dsthost_hash = flow_hash_from_keys(&host_keys);
659 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
660 		host_keys.addrs.v4addrs.dst = 0;
661 		srchost_hash = flow_hash_from_keys(&host_keys);
662 		break;
663 
664 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
665 		memset(&host_keys.addrs.v6addrs.src, 0,
666 		       sizeof(host_keys.addrs.v6addrs.src));
667 		dsthost_hash = flow_hash_from_keys(&host_keys);
668 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
669 		memset(&host_keys.addrs.v6addrs.dst, 0,
670 		       sizeof(host_keys.addrs.v6addrs.dst));
671 		srchost_hash = flow_hash_from_keys(&host_keys);
672 		break;
673 
674 	default:
675 		dsthost_hash = 0;
676 		srchost_hash = 0;
677 	}
678 
679 	/* This *must* be after the above switch, since as a
680 	 * side-effect it sorts the src and dst addresses.
681 	 */
682 	if (flow_mode & CAKE_FLOW_FLOWS)
683 		flow_hash = flow_hash_from_keys(&keys);
684 
685 skip_hash:
686 	if (flow_override)
687 		flow_hash = flow_override - 1;
688 	if (host_override) {
689 		dsthost_hash = host_override - 1;
690 		srchost_hash = host_override - 1;
691 	}
692 
693 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
694 		if (flow_mode & CAKE_FLOW_SRC_IP)
695 			flow_hash ^= srchost_hash;
696 
697 		if (flow_mode & CAKE_FLOW_DST_IP)
698 			flow_hash ^= dsthost_hash;
699 	}
700 
701 	reduced_hash = flow_hash % CAKE_QUEUES;
702 
703 	/* set-associative hashing */
704 	/* fast path if no hash collision (direct lookup succeeds) */
705 	if (likely(q->tags[reduced_hash] == flow_hash &&
706 		   q->flows[reduced_hash].set)) {
707 		q->way_directs++;
708 	} else {
709 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
710 		u32 outer_hash = reduced_hash - inner_hash;
711 		bool allocate_src = false;
712 		bool allocate_dst = false;
713 		u32 i, k;
714 
715 		/* check if any active queue in the set is reserved for
716 		 * this flow.
717 		 */
718 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
719 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
720 			if (q->tags[outer_hash + k] == flow_hash) {
721 				if (i)
722 					q->way_hits++;
723 
724 				if (!q->flows[outer_hash + k].set) {
725 					/* need to increment host refcnts */
726 					allocate_src = cake_dsrc(flow_mode);
727 					allocate_dst = cake_ddst(flow_mode);
728 				}
729 
730 				goto found;
731 			}
732 		}
733 
734 		/* no queue is reserved for this flow, look for an
735 		 * empty one.
736 		 */
737 		for (i = 0; i < CAKE_SET_WAYS;
738 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
739 			if (!q->flows[outer_hash + k].set) {
740 				q->way_misses++;
741 				allocate_src = cake_dsrc(flow_mode);
742 				allocate_dst = cake_ddst(flow_mode);
743 				goto found;
744 			}
745 		}
746 
747 		/* With no empty queues, default to the original
748 		 * queue, accept the collision, update the host tags.
749 		 */
750 		q->way_collisions++;
751 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
752 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
753 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
754 		}
755 		allocate_src = cake_dsrc(flow_mode);
756 		allocate_dst = cake_ddst(flow_mode);
757 found:
758 		/* reserve queue for future packets in same flow */
759 		reduced_hash = outer_hash + k;
760 		q->tags[reduced_hash] = flow_hash;
761 
762 		if (allocate_src) {
763 			srchost_idx = srchost_hash % CAKE_QUEUES;
764 			inner_hash = srchost_idx % CAKE_SET_WAYS;
765 			outer_hash = srchost_idx - inner_hash;
766 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
767 				i++, k = (k + 1) % CAKE_SET_WAYS) {
768 				if (q->hosts[outer_hash + k].srchost_tag ==
769 				    srchost_hash)
770 					goto found_src;
771 			}
772 			for (i = 0; i < CAKE_SET_WAYS;
773 				i++, k = (k + 1) % CAKE_SET_WAYS) {
774 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
775 					break;
776 			}
777 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
778 found_src:
779 			srchost_idx = outer_hash + k;
780 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
781 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
782 			q->flows[reduced_hash].srchost = srchost_idx;
783 		}
784 
785 		if (allocate_dst) {
786 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
787 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
788 			outer_hash = dsthost_idx - inner_hash;
789 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
790 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
791 				if (q->hosts[outer_hash + k].dsthost_tag ==
792 				    dsthost_hash)
793 					goto found_dst;
794 			}
795 			for (i = 0; i < CAKE_SET_WAYS;
796 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
797 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
798 					break;
799 			}
800 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
801 found_dst:
802 			dsthost_idx = outer_hash + k;
803 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
804 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
805 			q->flows[reduced_hash].dsthost = dsthost_idx;
806 		}
807 	}
808 
809 	return reduced_hash;
810 }
811 
812 /* helper functions: might be changed when/if skb uses a standard list_head */
813 /* remove one skb from head of slot queue */
814 
815 static struct sk_buff *dequeue_head(struct cake_flow *flow)
816 {
817 	struct sk_buff *skb = flow->head;
818 
819 	if (skb) {
820 		flow->head = skb->next;
821 		skb_mark_not_on_list(skb);
822 	}
823 
824 	return skb;
825 }
826 
827 /* add skb to flow queue (tail add) */
828 
829 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
830 {
831 	if (!flow->head)
832 		flow->head = skb;
833 	else
834 		flow->tail->next = skb;
835 	flow->tail = skb;
836 	skb->next = NULL;
837 }
838 
839 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
840 				    struct ipv6hdr *buf)
841 {
842 	unsigned int offset = skb_network_offset(skb);
843 	struct iphdr *iph;
844 
845 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
846 
847 	if (!iph)
848 		return NULL;
849 
850 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
851 		return skb_header_pointer(skb, offset + iph->ihl * 4,
852 					  sizeof(struct ipv6hdr), buf);
853 
854 	else if (iph->version == 4)
855 		return iph;
856 
857 	else if (iph->version == 6)
858 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
859 					  buf);
860 
861 	return NULL;
862 }
863 
864 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
865 				      void *buf, unsigned int bufsize)
866 {
867 	unsigned int offset = skb_network_offset(skb);
868 	const struct ipv6hdr *ipv6h;
869 	const struct tcphdr *tcph;
870 	const struct iphdr *iph;
871 	struct ipv6hdr _ipv6h;
872 	struct tcphdr _tcph;
873 
874 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
875 
876 	if (!ipv6h)
877 		return NULL;
878 
879 	if (ipv6h->version == 4) {
880 		iph = (struct iphdr *)ipv6h;
881 		offset += iph->ihl * 4;
882 
883 		/* special-case 6in4 tunnelling, as that is a common way to get
884 		 * v6 connectivity in the home
885 		 */
886 		if (iph->protocol == IPPROTO_IPV6) {
887 			ipv6h = skb_header_pointer(skb, offset,
888 						   sizeof(_ipv6h), &_ipv6h);
889 
890 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
891 				return NULL;
892 
893 			offset += sizeof(struct ipv6hdr);
894 
895 		} else if (iph->protocol != IPPROTO_TCP) {
896 			return NULL;
897 		}
898 
899 	} else if (ipv6h->version == 6) {
900 		if (ipv6h->nexthdr != IPPROTO_TCP)
901 			return NULL;
902 
903 		offset += sizeof(struct ipv6hdr);
904 	} else {
905 		return NULL;
906 	}
907 
908 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
909 	if (!tcph)
910 		return NULL;
911 
912 	return skb_header_pointer(skb, offset,
913 				  min(__tcp_hdrlen(tcph), bufsize), buf);
914 }
915 
916 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
917 				   int code, int *oplen)
918 {
919 	/* inspired by tcp_parse_options in tcp_input.c */
920 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
921 	const u8 *ptr = (const u8 *)(tcph + 1);
922 
923 	while (length > 0) {
924 		int opcode = *ptr++;
925 		int opsize;
926 
927 		if (opcode == TCPOPT_EOL)
928 			break;
929 		if (opcode == TCPOPT_NOP) {
930 			length--;
931 			continue;
932 		}
933 		opsize = *ptr++;
934 		if (opsize < 2 || opsize > length)
935 			break;
936 
937 		if (opcode == code) {
938 			*oplen = opsize;
939 			return ptr;
940 		}
941 
942 		ptr += opsize - 2;
943 		length -= opsize;
944 	}
945 
946 	return NULL;
947 }
948 
949 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
950  * bytes than the other. In the case where both sequences SACK bytes that the
951  * other doesn't, A is considered greater. A DSACK in A also makes A count as
952  * greater.
953  *
954  * @return -1, 0 or 1 as normal compare functions
955  */
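/* Example (illustrative sequence numbers, assuming neither block lies below
 * the cumulative ACK, i.e. no DSACK): if A SACKs [100, 200) and B SACKs
 * [100, 300), every range in A is covered by B and B covers more bytes, so
 * the function returns 1; with the arguments swapped it returns -1.
 */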
956 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
957 				  const struct tcphdr *tcph_b)
958 {
959 	const struct tcp_sack_block_wire *sack_a, *sack_b;
960 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
961 	u32 bytes_a = 0, bytes_b = 0;
962 	int oplen_a, oplen_b;
963 	bool first = true;
964 
965 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
966 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
967 
968 	/* pointers point to option contents */
969 	oplen_a -= TCPOLEN_SACK_BASE;
970 	oplen_b -= TCPOLEN_SACK_BASE;
971 
972 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
973 	    (!sack_b || oplen_b < sizeof(*sack_b)))
974 		return -1;
975 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
976 		 (!sack_a || oplen_a < sizeof(*sack_a)))
977 		return 1;
978 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
979 		 (!sack_b || oplen_b < sizeof(*sack_b)))
980 		return 0;
981 
982 	while (oplen_a >= sizeof(*sack_a)) {
983 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
984 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
985 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
986 		int oplen_tmp = oplen_b;
987 		bool found = false;
988 
989 		/* DSACK; always considered greater to prevent dropping */
990 		if (before(start_a, ack_seq_a))
991 			return -1;
992 
993 		bytes_a += end_a - start_a;
994 
995 		while (oplen_tmp >= sizeof(*sack_tmp)) {
996 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
997 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
998 
999 			/* first time through we count the total size */
1000 			if (first)
1001 				bytes_b += end_b - start_b;
1002 
1003 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1004 				found = true;
1005 				if (!first)
1006 					break;
1007 			}
1008 			oplen_tmp -= sizeof(*sack_tmp);
1009 			sack_tmp++;
1010 		}
1011 
1012 		if (!found)
1013 			return -1;
1014 
1015 		oplen_a -= sizeof(*sack_a);
1016 		sack_a++;
1017 		first = false;
1018 	}
1019 
1020 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1021 	 * either the SACKs are equal, or B SACKs more bytes.
1022 	 */
1023 	return bytes_b > bytes_a ? 1 : 0;
1024 }
1025 
1026 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1027 				 u32 *tsval, u32 *tsecr)
1028 {
1029 	const u8 *ptr;
1030 	int opsize;
1031 
1032 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1033 
1034 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1035 		*tsval = get_unaligned_be32(ptr);
1036 		*tsecr = get_unaligned_be32(ptr + 4);
1037 	}
1038 }
1039 
1040 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1041 			       u32 tstamp_new, u32 tsecr_new)
1042 {
1043 	/* inspired by tcp_parse_options in tcp_input.c */
1044 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1045 	const u8 *ptr = (const u8 *)(tcph + 1);
1046 	u32 tstamp, tsecr;
1047 
1048 	/* 3 reserved flags must be unset to avoid future breakage
1049 	 * ACK must be set
1050 	 * ECE/CWR are handled separately
1051 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1052 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1053 	 * 0x00C00000 = CWR/ECE (handled separately)
1054 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1055 	 */
1056 	if (((tcp_flag_word(tcph) &
1057 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1058 		return false;
1059 
1060 	while (length > 0) {
1061 		int opcode = *ptr++;
1062 		int opsize;
1063 
1064 		if (opcode == TCPOPT_EOL)
1065 			break;
1066 		if (opcode == TCPOPT_NOP) {
1067 			length--;
1068 			continue;
1069 		}
1070 		opsize = *ptr++;
1071 		if (opsize < 2 || opsize > length)
1072 			break;
1073 
1074 		switch (opcode) {
1075 		case TCPOPT_MD5SIG: /* doesn't influence state */
1076 			break;
1077 
1078 		case TCPOPT_SACK: /* stricter checking performed later */
1079 			if (opsize % 8 != 2)
1080 				return false;
1081 			break;
1082 
1083 		case TCPOPT_TIMESTAMP:
1084 			/* only drop timestamps lower than new */
1085 			if (opsize != TCPOLEN_TIMESTAMP)
1086 				return false;
1087 			tstamp = get_unaligned_be32(ptr);
1088 			tsecr = get_unaligned_be32(ptr + 4);
1089 			if (after(tstamp, tstamp_new) ||
1090 			    after(tsecr, tsecr_new))
1091 				return false;
1092 			break;
1093 
1094 		case TCPOPT_MSS:  /* these should only be set on SYN */
1095 		case TCPOPT_WINDOW:
1096 		case TCPOPT_SACK_PERM:
1097 		case TCPOPT_FASTOPEN:
1098 		case TCPOPT_EXP:
1099 		default: /* don't drop if any unknown options are present */
1100 			return false;
1101 		}
1102 
1103 		ptr += opsize - 2;
1104 		length -= opsize;
1105 	}
1106 
1107 	return true;
1108 }
1109 
1110 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1111 				       struct cake_flow *flow)
1112 {
1113 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1114 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1115 	struct sk_buff *skb_check, *skb_prev = NULL;
1116 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1117 	unsigned char _tcph[64], _tcph_check[64];
1118 	const struct tcphdr *tcph, *tcph_check;
1119 	const struct iphdr *iph, *iph_check;
1120 	struct ipv6hdr _iph, _iph_check;
1121 	const struct sk_buff *skb;
1122 	int seglen, num_found = 0;
1123 	u32 tstamp = 0, tsecr = 0;
1124 	__be32 elig_flags = 0;
1125 	int sack_comp;
1126 
1127 	/* no other possible ACKs to filter */
1128 	if (flow->head == flow->tail)
1129 		return NULL;
1130 
1131 	skb = flow->tail;
1132 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1133 	iph = cake_get_iphdr(skb, &_iph);
1134 	if (!tcph)
1135 		return NULL;
1136 
1137 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1138 
1139 	/* the 'triggering' packet need only have the ACK flag set.
1140 	 * also check that SYN is not set, as there won't be any previous ACKs.
1141 	 */
1142 	if ((tcp_flag_word(tcph) &
1143 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1144 		return NULL;
1145 
1146 	/* The 'triggering' ACK is at the tail of the queue; we have already
1147 	 * returned if it is the only packet in the flow. Loop through the rest
1148 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1149 	 * triggering one.
1150 	 */
1151 	for (skb_check = flow->head;
1152 	     skb_check && skb_check != skb;
1153 	     skb_prev = skb_check, skb_check = skb_check->next) {
1154 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1155 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1156 					     sizeof(_tcph_check));
1157 
1158 		/* only TCP packets with a matching 5-tuple are eligible, and only
1159 		 * ACKs whose headers are safe to drop
1160 		 */
1161 		if (!tcph_check || iph->version != iph_check->version ||
1162 		    tcph_check->source != tcph->source ||
1163 		    tcph_check->dest != tcph->dest)
1164 			continue;
1165 
1166 		if (iph_check->version == 4) {
1167 			if (iph_check->saddr != iph->saddr ||
1168 			    iph_check->daddr != iph->daddr)
1169 				continue;
1170 
1171 			seglen = ntohs(iph_check->tot_len) -
1172 				       (4 * iph_check->ihl);
1173 		} else if (iph_check->version == 6) {
1174 			ipv6h = (struct ipv6hdr *)iph;
1175 			ipv6h_check = (struct ipv6hdr *)iph_check;
1176 
1177 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1178 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1179 				continue;
1180 
1181 			seglen = ntohs(ipv6h_check->payload_len);
1182 		} else {
1183 			WARN_ON(1);  /* shouldn't happen */
1184 			continue;
1185 		}
1186 
1187 		/* If the ECE/CWR flags changed from the previous eligible
1188 		 * packet in the same flow, we should no longer be dropping that
1189 		 * previous packet as this would lose information.
1190 		 */
1191 		if (elig_ack && (tcp_flag_word(tcph_check) &
1192 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1193 			elig_ack = NULL;
1194 			elig_ack_prev = NULL;
1195 			num_found--;
1196 		}
1197 
1198 		/* Check TCP options and flags, don't drop ACKs with segment
1199 		 * data, and don't drop ACKs with a higher cumulative ACK
1200 		 * counter than the triggering packet. Check ACK seqno here to
1201 		 * avoid parsing SACK options of packets we are going to exclude
1202 		 * anyway.
1203 		 */
1204 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1205 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1206 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1207 			continue;
1208 
1209 		/* Check SACK options. The triggering packet must SACK more data
1210 		 * than the ACK under consideration, or SACK the same range but
1211 		 * have a larger cumulative ACK counter. The latter is a
1212 		 * pathological case, but is contained in the following check
1213 		 * anyway, just to be safe.
1214 		 */
1215 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1216 
1217 		if (sack_comp < 0 ||
1218 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1219 		     sack_comp == 0))
1220 			continue;
1221 
1222 		/* At this point we have found an eligible pure ACK to drop; if
1223 		 * we are in aggressive mode, we are done. Otherwise, keep
1224 		 * searching unless this is the second eligible ACK we
1225 		 * found.
1226 		 *
1227 		 * Since we want to drop the ACK closest to the head of the queue,
1228 		 * save the first eligible ACK we find, even if we need to loop
1229 		 * again.
1230 		 */
1231 		if (!elig_ack) {
1232 			elig_ack = skb_check;
1233 			elig_ack_prev = skb_prev;
1234 			elig_flags = (tcp_flag_word(tcph_check)
1235 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1236 		}
1237 
1238 		if (num_found++ > 0)
1239 			goto found;
1240 	}
1241 
1242 	/* We made it through the queue without finding two eligible ACKs.  If
1243 	 * we found a single eligible ACK we can drop it in aggressive mode if
1244 	 * we can guarantee that this does not interfere with ECN flag
1245 	 * information. We ensure this by dropping it only if the enqueued
1246 	 * packet is consecutive with the eligible ACK, and their flags match.
1247 	 */
1248 	if (elig_ack && aggressive && elig_ack->next == skb &&
1249 	    (elig_flags == (tcp_flag_word(tcph) &
1250 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1251 		goto found;
1252 
1253 	return NULL;
1254 
1255 found:
1256 	if (elig_ack_prev)
1257 		elig_ack_prev->next = elig_ack->next;
1258 	else
1259 		flow->head = elig_ack->next;
1260 
1261 	skb_mark_not_on_list(elig_ack);
1262 
1263 	return elig_ack;
1264 }
1265 
1266 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1267 {
1268 	avg -= avg >> shift;
1269 	avg += sample >> shift;
1270 	return avg;
1271 }
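/* For reference (illustrative): this is an EWMA with weight 2^-shift on the
 * new sample; shift = 8 blends in roughly 1/256 of each sample, while
 * shift = 2 (used for fast adaptation in the autorate estimator below)
 * blends in 1/4.
 */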
1272 
1273 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1274 {
1275 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1276 		len -= off;
1277 
1278 	if (q->max_netlen < len)
1279 		q->max_netlen = len;
1280 	if (q->min_netlen > len)
1281 		q->min_netlen = len;
1282 
1283 	len += q->rate_overhead;
1284 
1285 	if (len < q->rate_mpu)
1286 		len = q->rate_mpu;
1287 
1288 	if (q->atm_mode == CAKE_ATM_ATM) {
1289 		len += 47;
1290 		len /= 48;
1291 		len *= 53;
1292 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1293 		/* Add one byte per 64 bytes or part thereof.
1294 		 * This is conservative and easier to calculate than the
1295 		 * precise value.
1296 		 */
1297 		len += (len + 63) / 64;
1298 	}
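	/* Worked example (illustrative): with len = 100 bytes, ATM framing
	 * yields (100 + 47) / 48 = 3 cells, i.e. 3 * 53 = 159 bytes on the
	 * wire; PTM framing turns a 1500-byte frame into
	 * 1500 + (1500 + 63) / 64 = 1524 bytes.
	 */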
1299 
1300 	if (q->max_adjlen < len)
1301 		q->max_adjlen = len;
1302 	if (q->min_adjlen > len)
1303 		q->min_adjlen = len;
1304 
1305 	return len;
1306 }
1307 
1308 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1309 {
1310 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1311 	unsigned int hdr_len, last_len = 0;
1312 	u32 off = skb_network_offset(skb);
1313 	u32 len = qdisc_pkt_len(skb);
1314 	u16 segs = 1;
1315 
1316 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1317 
1318 	if (!shinfo->gso_size)
1319 		return cake_calc_overhead(q, len, off);
1320 
1321 	/* borrowed from qdisc_pkt_len_init() */
1322 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1323 
1324 	/* + transport layer */
1325 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1326 						SKB_GSO_TCPV6))) {
1327 		const struct tcphdr *th;
1328 		struct tcphdr _tcphdr;
1329 
1330 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1331 					sizeof(_tcphdr), &_tcphdr);
1332 		if (likely(th))
1333 			hdr_len += __tcp_hdrlen(th);
1334 	} else {
1335 		struct udphdr _udphdr;
1336 
1337 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1338 				       sizeof(_udphdr), &_udphdr))
1339 			hdr_len += sizeof(struct udphdr);
1340 	}
1341 
1342 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1343 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1344 				    shinfo->gso_size);
1345 	else
1346 		segs = shinfo->gso_segs;
1347 
1348 	len = shinfo->gso_size + hdr_len;
1349 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1350 
1351 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1352 		cake_calc_overhead(q, last_len, off));
1353 }
1354 
1355 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1356 {
1357 	struct cake_heap_entry ii = q->overflow_heap[i];
1358 	struct cake_heap_entry jj = q->overflow_heap[j];
1359 
1360 	q->overflow_heap[i] = jj;
1361 	q->overflow_heap[j] = ii;
1362 
1363 	q->tins[ii.t].overflow_idx[ii.b] = j;
1364 	q->tins[jj.t].overflow_idx[jj.b] = i;
1365 }
1366 
1367 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1368 {
1369 	struct cake_heap_entry ii = q->overflow_heap[i];
1370 
1371 	return q->tins[ii.t].backlogs[ii.b];
1372 }
1373 
1374 static void cake_heapify(struct cake_sched_data *q, u16 i)
1375 {
1376 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1377 	u32 mb = cake_heap_get_backlog(q, i);
1378 	u32 m = i;
1379 
1380 	while (m < a) {
1381 		u32 l = m + m + 1;
1382 		u32 r = l + 1;
1383 
1384 		if (l < a) {
1385 			u32 lb = cake_heap_get_backlog(q, l);
1386 
1387 			if (lb > mb) {
1388 				m  = l;
1389 				mb = lb;
1390 			}
1391 		}
1392 
1393 		if (r < a) {
1394 			u32 rb = cake_heap_get_backlog(q, r);
1395 
1396 			if (rb > mb) {
1397 				m  = r;
1398 				mb = rb;
1399 			}
1400 		}
1401 
1402 		if (m != i) {
1403 			cake_heap_swap(q, i, m);
1404 			i = m;
1405 		} else {
1406 			break;
1407 		}
1408 	}
1409 }
1410 
1411 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1412 {
1413 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1414 		u16 p = (i - 1) >> 1;
1415 		u32 ib = cake_heap_get_backlog(q, i);
1416 		u32 pb = cake_heap_get_backlog(q, p);
1417 
1418 		if (ib > pb) {
1419 			cake_heap_swap(q, i, p);
1420 			i = p;
1421 		} else {
1422 			break;
1423 		}
1424 	}
1425 }
1426 
1427 static int cake_advance_shaper(struct cake_sched_data *q,
1428 			       struct cake_tin_data *b,
1429 			       struct sk_buff *skb,
1430 			       ktime_t now, bool drop)
1431 {
1432 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1433 
1434 	/* charge packet bandwidth to this tin
1435 	 * and to the global shaper.
1436 	 */
1437 	if (q->rate_ns) {
1438 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1439 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1440 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1441 
1442 		if (ktime_before(b->time_next_packet, now))
1443 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1444 							   tin_dur);
1445 
1446 		else if (ktime_before(b->time_next_packet,
1447 				      ktime_add_ns(now, tin_dur)))
1448 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1449 
1450 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1451 						   global_dur);
1452 		if (!drop)
1453 			q->failsafe_next_packet = \
1454 				ktime_add_ns(q->failsafe_next_packet,
1455 					     failsafe_dur);
1456 	}
1457 	return len;
1458 }
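/* Worked example (illustrative, assuming a 20 Mbit/s rate): at that rate
 * (len * rate_ns) >> rate_shft works out to roughly 400 ns per byte, so a
 * 1514-byte packet advances time_next_packet by about 600 us; the failsafe
 * horizon advances by 1.5x that, bounding how long the shaper can stall if
 * the configured rate turns out to be wrong.
 */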
1459 
1460 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1461 {
1462 	struct cake_sched_data *q = qdisc_priv(sch);
1463 	ktime_t now = ktime_get();
1464 	u32 idx = 0, tin = 0, len;
1465 	struct cake_heap_entry qq;
1466 	struct cake_tin_data *b;
1467 	struct cake_flow *flow;
1468 	struct sk_buff *skb;
1469 
1470 	if (!q->overflow_timeout) {
1471 		int i;
1472 		/* Build fresh max-heap */
1473 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1474 			cake_heapify(q, i);
1475 	}
1476 	q->overflow_timeout = 65535;
1477 
1478 	/* select longest queue for pruning */
1479 	qq  = q->overflow_heap[0];
1480 	tin = qq.t;
1481 	idx = qq.b;
1482 
1483 	b = &q->tins[tin];
1484 	flow = &b->flows[idx];
1485 	skb = dequeue_head(flow);
1486 	if (unlikely(!skb)) {
1487 		/* heap has gone wrong, rebuild it next time */
1488 		q->overflow_timeout = 0;
1489 		return idx + (tin << 16);
1490 	}
1491 
1492 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1493 		b->unresponsive_flow_count++;
1494 
1495 	len = qdisc_pkt_len(skb);
1496 	q->buffer_used      -= skb->truesize;
1497 	b->backlogs[idx]    -= len;
1498 	b->tin_backlog      -= len;
1499 	sch->qstats.backlog -= len;
1500 	qdisc_tree_reduce_backlog(sch, 1, len);
1501 
1502 	flow->dropped++;
1503 	b->tin_dropped++;
1504 	sch->qstats.drops++;
1505 
1506 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1507 		cake_advance_shaper(q, b, skb, now, true);
1508 
1509 	__qdisc_drop(skb, to_free);
1510 	sch->q.qlen--;
1511 
1512 	cake_heapify(q, 0);
1513 
1514 	return idx + (tin << 16);
1515 }
1516 
1517 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1518 {
1519 	int wlen = skb_network_offset(skb);
1520 	u8 dscp;
1521 
1522 	switch (tc_skb_protocol(skb)) {
1523 	case htons(ETH_P_IP):
1524 		wlen += sizeof(struct iphdr);
1525 		if (!pskb_may_pull(skb, wlen) ||
1526 		    skb_try_make_writable(skb, wlen))
1527 			return 0;
1528 
1529 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1530 		if (wash && dscp)
1531 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1532 		return dscp;
1533 
1534 	case htons(ETH_P_IPV6):
1535 		wlen += sizeof(struct ipv6hdr);
1536 		if (!pskb_may_pull(skb, wlen) ||
1537 		    skb_try_make_writable(skb, wlen))
1538 			return 0;
1539 
1540 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1541 		if (wash && dscp)
1542 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1543 		return dscp;
1544 
1545 	case htons(ETH_P_ARP):
1546 		return 0x38;  /* CS7 - Net Control */
1547 
1548 	default:
1549 		/* If there is no Diffserv field, treat as best-effort */
1550 		return 0;
1551 	}
1552 }
1553 
1554 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1555 					     struct sk_buff *skb)
1556 {
1557 	struct cake_sched_data *q = qdisc_priv(sch);
1558 	u32 tin, mark;
1559 	u8 dscp;
1560 
1561 	/* Tin selection: Default to diffserv-based selection, allow overriding
1562 	 * using firewall marks or skb->priority.
1563 	 */
1564 	dscp = cake_handle_diffserv(skb,
1565 				    q->rate_flags & CAKE_FLAG_WASH);
1566 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1567 
1568 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1569 		tin = 0;
1570 
1571 	else if (mark && mark <= q->tin_cnt)
1572 		tin = q->tin_order[mark - 1];
1573 
1574 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1575 		 TC_H_MIN(skb->priority) > 0 &&
1576 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1577 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1578 
1579 	else {
1580 		tin = q->tin_index[dscp];
1581 
1582 		if (unlikely(tin >= q->tin_cnt))
1583 			tin = 0;
1584 	}
1585 
1586 	return &q->tins[tin];
1587 }
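/* Example (illustrative): a packet carrying DSCP CS1 has dsfield 0x20, so
 * cake_handle_diffserv() returns 8; with the diffserv3 lookup table that maps
 * to tin 1 (the low-priority bulk tin), while an unmarked packet (DSCP 0)
 * stays in tin 0.
 */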
1588 
1589 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1590 			 struct sk_buff *skb, int flow_mode, int *qerr)
1591 {
1592 	struct cake_sched_data *q = qdisc_priv(sch);
1593 	struct tcf_proto *filter;
1594 	struct tcf_result res;
1595 	u16 flow = 0, host = 0;
1596 	int result;
1597 
1598 	filter = rcu_dereference_bh(q->filter_list);
1599 	if (!filter)
1600 		goto hash;
1601 
1602 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1603 	result = tcf_classify(skb, filter, &res, false);
1604 
1605 	if (result >= 0) {
1606 #ifdef CONFIG_NET_CLS_ACT
1607 		switch (result) {
1608 		case TC_ACT_STOLEN:
1609 		case TC_ACT_QUEUED:
1610 		case TC_ACT_TRAP:
1611 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1612 			/* fall through */
1613 		case TC_ACT_SHOT:
1614 			return 0;
1615 		}
1616 #endif
1617 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1618 			flow = TC_H_MIN(res.classid);
1619 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1620 			host = TC_H_MAJ(res.classid) >> 16;
1621 	}
1622 hash:
1623 	*t = cake_select_tin(sch, skb);
1624 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1625 }
1626 
1627 static void cake_reconfigure(struct Qdisc *sch);
1628 
1629 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1630 			struct sk_buff **to_free)
1631 {
1632 	struct cake_sched_data *q = qdisc_priv(sch);
1633 	int len = qdisc_pkt_len(skb);
1634 	int uninitialized_var(ret);
1635 	struct sk_buff *ack = NULL;
1636 	ktime_t now = ktime_get();
1637 	struct cake_tin_data *b;
1638 	struct cake_flow *flow;
1639 	u32 idx;
1640 
1641 	/* choose flow to insert into */
1642 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1643 	if (idx == 0) {
1644 		if (ret & __NET_XMIT_BYPASS)
1645 			qdisc_qstats_drop(sch);
1646 		__qdisc_drop(skb, to_free);
1647 		return ret;
1648 	}
1649 	idx--;
1650 	flow = &b->flows[idx];
1651 
1652 	/* ensure shaper state isn't stale */
1653 	if (!b->tin_backlog) {
1654 		if (ktime_before(b->time_next_packet, now))
1655 			b->time_next_packet = now;
1656 
1657 		if (!sch->q.qlen) {
1658 			if (ktime_before(q->time_next_packet, now)) {
1659 				q->failsafe_next_packet = now;
1660 				q->time_next_packet = now;
1661 			} else if (ktime_after(q->time_next_packet, now) &&
1662 				   ktime_after(q->failsafe_next_packet, now)) {
1663 				u64 next = \
1664 					min(ktime_to_ns(q->time_next_packet),
1665 					    ktime_to_ns(
1666 						   q->failsafe_next_packet));
1667 				sch->qstats.overlimits++;
1668 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1669 			}
1670 		}
1671 	}
1672 
1673 	if (unlikely(len > b->max_skblen))
1674 		b->max_skblen = len;
1675 
1676 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1677 		struct sk_buff *segs, *nskb;
1678 		netdev_features_t features = netif_skb_features(skb);
1679 		unsigned int slen = 0, numsegs = 0;
1680 
1681 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1682 		if (IS_ERR_OR_NULL(segs))
1683 			return qdisc_drop(skb, sch, to_free);
1684 
1685 		skb_list_walk_safe(segs, segs, nskb) {
1686 			skb_mark_not_on_list(segs);
1687 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1688 			cobalt_set_enqueue_time(segs, now);
1689 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1690 									  segs);
1691 			flow_queue_add(flow, segs);
1692 
1693 			sch->q.qlen++;
1694 			numsegs++;
1695 			slen += segs->len;
1696 			q->buffer_used += segs->truesize;
1697 			b->packets++;
1698 		}
1699 
1700 		/* stats */
1701 		b->bytes	    += slen;
1702 		b->backlogs[idx]    += slen;
1703 		b->tin_backlog      += slen;
1704 		sch->qstats.backlog += slen;
1705 		q->avg_window_bytes += slen;
1706 
1707 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1708 		consume_skb(skb);
1709 	} else {
1710 		/* not splitting */
1711 		cobalt_set_enqueue_time(skb, now);
1712 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1713 		flow_queue_add(flow, skb);
1714 
1715 		if (q->ack_filter)
1716 			ack = cake_ack_filter(q, flow);
1717 
1718 		if (ack) {
1719 			b->ack_drops++;
1720 			sch->qstats.drops++;
1721 			b->bytes += qdisc_pkt_len(ack);
1722 			len -= qdisc_pkt_len(ack);
1723 			q->buffer_used += skb->truesize - ack->truesize;
1724 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1725 				cake_advance_shaper(q, b, ack, now, true);
1726 
1727 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1728 			consume_skb(ack);
1729 		} else {
1730 			sch->q.qlen++;
1731 			q->buffer_used      += skb->truesize;
1732 		}
1733 
1734 		/* stats */
1735 		b->packets++;
1736 		b->bytes	    += len;
1737 		b->backlogs[idx]    += len;
1738 		b->tin_backlog      += len;
1739 		sch->qstats.backlog += len;
1740 		q->avg_window_bytes += len;
1741 	}
1742 
1743 	if (q->overflow_timeout)
1744 		cake_heapify_up(q, b->overflow_idx[idx]);
1745 
1746 	/* incoming bandwidth capacity estimate */
1747 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1748 		u64 packet_interval = \
1749 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1750 
1751 		if (packet_interval > NSEC_PER_SEC)
1752 			packet_interval = NSEC_PER_SEC;
1753 
1754 		/* filter out short-term bursts, e.g. wifi aggregation */
1755 		q->avg_packet_interval = \
1756 			cake_ewma(q->avg_packet_interval,
1757 				  packet_interval,
1758 				  (packet_interval > q->avg_packet_interval ?
1759 					  2 : 8));
1760 
1761 		q->last_packet_time = now;
1762 
1763 		if (packet_interval > q->avg_packet_interval) {
1764 			u64 window_interval = \
1765 				ktime_to_ns(ktime_sub(now,
1766 						      q->avg_window_begin));
1767 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1768 
1769 			b = div64_u64(b, window_interval);
1770 			q->avg_peak_bandwidth =
1771 				cake_ewma(q->avg_peak_bandwidth, b,
1772 					  b > q->avg_peak_bandwidth ? 2 : 8);
1773 			q->avg_window_bytes = 0;
1774 			q->avg_window_begin = now;
1775 
1776 			if (ktime_after(now,
1777 					ktime_add_ms(q->last_reconfig_time,
1778 						     250))) {
1779 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1780 				cake_reconfigure(sch);
1781 			}
1782 		}
1783 	} else {
1784 		q->avg_window_bytes = 0;
1785 		q->last_packet_time = now;
1786 	}
1787 
1788 	/* flowchain */
1789 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1790 		struct cake_host *srchost = &b->hosts[flow->srchost];
1791 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1792 		u16 host_load = 1;
1793 
1794 		if (!flow->set) {
1795 			list_add_tail(&flow->flowchain, &b->new_flows);
1796 		} else {
1797 			b->decaying_flow_count--;
1798 			list_move_tail(&flow->flowchain, &b->new_flows);
1799 		}
1800 		flow->set = CAKE_SET_SPARSE;
1801 		b->sparse_flow_count++;
1802 
1803 		if (cake_dsrc(q->flow_mode))
1804 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1805 
1806 		if (cake_ddst(q->flow_mode))
1807 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1808 
1809 		flow->deficit = (b->flow_quantum *
1810 				 quantum_div[host_load]) >> 16;
1811 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1812 		struct cake_host *srchost = &b->hosts[flow->srchost];
1813 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1814 
1815 		/* this flow was empty, accounted as a sparse flow, but actually
1816 		 * in the bulk rotation.
1817 		 */
1818 		flow->set = CAKE_SET_BULK;
1819 		b->sparse_flow_count--;
1820 		b->bulk_flow_count++;
1821 
1822 		if (cake_dsrc(q->flow_mode))
1823 			srchost->srchost_bulk_flow_count++;
1824 
1825 		if (cake_ddst(q->flow_mode))
1826 			dsthost->dsthost_bulk_flow_count++;
1827 
1828 	}
1829 
1830 	if (q->buffer_used > q->buffer_max_used)
1831 		q->buffer_max_used = q->buffer_used;
1832 
1833 	if (q->buffer_used > q->buffer_limit) {
1834 		u32 dropped = 0;
1835 
1836 		while (q->buffer_used > q->buffer_limit) {
1837 			dropped++;
1838 			cake_drop(sch, to_free);
1839 		}
1840 		b->drop_overlimit += dropped;
1841 	}
1842 	return NET_XMIT_SUCCESS;
1843 }
1844 
1845 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1846 {
1847 	struct cake_sched_data *q = qdisc_priv(sch);
1848 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1849 	struct cake_flow *flow = &b->flows[q->cur_flow];
1850 	struct sk_buff *skb = NULL;
1851 	u32 len;
1852 
1853 	if (flow->head) {
1854 		skb = dequeue_head(flow);
1855 		len = qdisc_pkt_len(skb);
1856 		b->backlogs[q->cur_flow] -= len;
1857 		b->tin_backlog		 -= len;
1858 		sch->qstats.backlog      -= len;
1859 		q->buffer_used		 -= skb->truesize;
1860 		sch->q.qlen--;
1861 
1862 		if (q->overflow_timeout)
1863 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1864 	}
1865 	return skb;
1866 }
1867 
1868 /* Discard leftover packets from a tin no longer in use. */
1869 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1870 {
1871 	struct cake_sched_data *q = qdisc_priv(sch);
1872 	struct sk_buff *skb;
1873 
1874 	q->cur_tin = tin;
1875 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1876 		while (!!(skb = cake_dequeue_one(sch)))
1877 			kfree_skb(skb);
1878 }
1879 
1880 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1881 {
1882 	struct cake_sched_data *q = qdisc_priv(sch);
1883 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1884 	struct cake_host *srchost, *dsthost;
1885 	ktime_t now = ktime_get();
1886 	struct cake_flow *flow;
1887 	struct list_head *head;
1888 	bool first_flow = true;
1889 	struct sk_buff *skb;
1890 	u16 host_load;
1891 	u64 delay;
1892 	u32 len;
1893 
1894 begin:
1895 	if (!sch->q.qlen)
1896 		return NULL;
1897 
1898 	/* global hard shaper */
1899 	if (ktime_after(q->time_next_packet, now) &&
1900 	    ktime_after(q->failsafe_next_packet, now)) {
1901 		u64 next = min(ktime_to_ns(q->time_next_packet),
1902 			       ktime_to_ns(q->failsafe_next_packet));
1903 
1904 		sch->qstats.overlimits++;
1905 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1906 		return NULL;
1907 	}
1908 
1909 	/* Choose a class to work on. */
1910 	if (!q->rate_ns) {
1911 		/* In unlimited mode, can't rely on shaper timings, just balance
1912 		 * with DRR
1913 		 */
1914 		bool wrapped = false, empty = true;
1915 
1916 		while (b->tin_deficit < 0 ||
1917 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1918 			if (b->tin_deficit <= 0)
1919 				b->tin_deficit += b->tin_quantum;
1920 			if (b->sparse_flow_count + b->bulk_flow_count)
1921 				empty = false;
1922 
1923 			q->cur_tin++;
1924 			b++;
1925 			if (q->cur_tin >= q->tin_cnt) {
1926 				q->cur_tin = 0;
1927 				b = q->tins;
1928 
1929 				if (wrapped) {
1930 					/* It's possible for q->qlen to be
1931 					 * nonzero when we actually have no
1932 					 * packets anywhere.
1933 					 */
1934 					if (empty)
1935 						return NULL;
1936 				} else {
1937 					wrapped = true;
1938 				}
1939 			}
1940 		}
1941 	} else {
1942 		/* In shaped mode, choose:
1943 		 * - Highest-priority tin with queue and meeting schedule, or
1944 		 * - The earliest-scheduled tin with queue.
1945 		 */
1946 		ktime_t best_time = KTIME_MAX;
1947 		int tin, best_tin = 0;
1948 
1949 		for (tin = 0; tin < q->tin_cnt; tin++) {
1950 			b = q->tins + tin;
1951 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1952 				ktime_t time_to_pkt = \
1953 					ktime_sub(b->time_next_packet, now);
1954 
1955 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1956 				    ktime_compare(time_to_pkt,
1957 						  best_time) <= 0) {
1958 					best_time = time_to_pkt;
1959 					best_tin = tin;
1960 				}
1961 			}
1962 		}
1963 
1964 		q->cur_tin = best_tin;
1965 		b = q->tins + best_tin;
1966 
1967 		/* No point in going further if no packets to deliver. */
1968 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1969 			return NULL;
1970 	}
1971 
1972 retry:
1973 	/* service this class */
1974 	head = &b->decaying_flows;
1975 	if (!first_flow || list_empty(head)) {
1976 		head = &b->new_flows;
1977 		if (list_empty(head)) {
1978 			head = &b->old_flows;
1979 			if (unlikely(list_empty(head))) {
1980 				head = &b->decaying_flows;
1981 				if (unlikely(list_empty(head)))
1982 					goto begin;
1983 			}
1984 		}
1985 	}
1986 	flow = list_first_entry(head, struct cake_flow, flowchain);
1987 	q->cur_flow = flow - b->flows;
1988 	first_flow = false;
1989 
1990 	/* triple isolation (modified DRR++) */
1991 	srchost = &b->hosts[flow->srchost];
1992 	dsthost = &b->hosts[flow->dsthost];
1993 	host_load = 1;
1994 
1995 	/* flow isolation (DRR++) */
1996 	if (flow->deficit <= 0) {
1997 		/* Keep all flows with deficits out of the sparse and decaying
1998 		 * rotations.  No non-empty flow can go into the decaying
1999 		 * rotation, so they can't get deficits
2000 		 */
2001 		if (flow->set == CAKE_SET_SPARSE) {
2002 			if (flow->head) {
2003 				b->sparse_flow_count--;
2004 				b->bulk_flow_count++;
2005 
2006 				if (cake_dsrc(q->flow_mode))
2007 					srchost->srchost_bulk_flow_count++;
2008 
2009 				if (cake_ddst(q->flow_mode))
2010 					dsthost->dsthost_bulk_flow_count++;
2011 
2012 				flow->set = CAKE_SET_BULK;
2013 			} else {
2014 				/* we've moved it to the bulk rotation for
2015 				 * correct deficit accounting but we still want
2016 				 * to count it as a sparse flow, not a bulk one.
2017 				 */
2018 				flow->set = CAKE_SET_SPARSE_WAIT;
2019 			}
2020 		}
2021 
2022 		if (cake_dsrc(q->flow_mode))
2023 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2024 
2025 		if (cake_ddst(q->flow_mode))
2026 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2027 
2028 		WARN_ON(host_load > CAKE_QUEUES);
2029 
2030 		/* The shifted prandom_u32() is a way to apply dithering to
2031 		 * avoid accumulating roundoff errors
2032 		 */
2033 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2034 				  (prandom_u32() >> 16)) >> 16;
2035 		list_move_tail(&flow->flowchain, &b->old_flows);
2036 
2037 		goto retry;
2038 	}
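
	/* Worked example of the dithered refill above: flow_quantum = 1514
	 * with host_load = 1 gives 1514 * 65535 = 99219990 in 16.16 fixed
	 * point; adding a random 16-bit fraction (prandom_u32() >> 16) before
	 * the final >> 16 rounds the result up or down at random, so repeated
	 * refills average out to the exact quotient instead of always
	 * truncating.
	 */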
2039 
2040 	/* Retrieve a packet via the AQM */
2041 	while (1) {
2042 		skb = cake_dequeue_one(sch);
2043 		if (!skb) {
2044 			/* this queue was actually empty */
2045 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2046 				b->unresponsive_flow_count--;
2047 
2048 			if (flow->cvars.p_drop || flow->cvars.count ||
2049 			    ktime_before(now, flow->cvars.drop_next)) {
2050 				/* keep in the flowchain until the state has
2051 				 * decayed to rest
2052 				 */
2053 				list_move_tail(&flow->flowchain,
2054 					       &b->decaying_flows);
2055 				if (flow->set == CAKE_SET_BULK) {
2056 					b->bulk_flow_count--;
2057 
2058 					if (cake_dsrc(q->flow_mode))
2059 						srchost->srchost_bulk_flow_count--;
2060 
2061 					if (cake_ddst(q->flow_mode))
2062 						dsthost->dsthost_bulk_flow_count--;
2063 
2064 					b->decaying_flow_count++;
2065 				} else if (flow->set == CAKE_SET_SPARSE ||
2066 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2067 					b->sparse_flow_count--;
2068 					b->decaying_flow_count++;
2069 				}
2070 				flow->set = CAKE_SET_DECAYING;
2071 			} else {
2072 				/* remove empty queue from the flowchain */
2073 				list_del_init(&flow->flowchain);
2074 				if (flow->set == CAKE_SET_SPARSE ||
2075 				    flow->set == CAKE_SET_SPARSE_WAIT)
2076 					b->sparse_flow_count--;
2077 				else if (flow->set == CAKE_SET_BULK) {
2078 					b->bulk_flow_count--;
2079 
2080 					if (cake_dsrc(q->flow_mode))
2081 						srchost->srchost_bulk_flow_count--;
2082 
2083 					if (cake_ddst(q->flow_mode))
2084 						dsthost->dsthost_bulk_flow_count--;
2085 
2086 				} else
2087 					b->decaying_flow_count--;
2088 
2089 				flow->set = CAKE_SET_NONE;
2090 			}
2091 			goto begin;
2092 		}
2093 
2094 		/* Last packet in queue may be marked, shouldn't be dropped */
2095 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2096 					(b->bulk_flow_count *
2097 					 !!(q->rate_flags &
2098 					    CAKE_FLAG_INGRESS))) ||
2099 		    !flow->head)
2100 			break;
2101 
2102 		/* drop this packet, get another one */
2103 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2104 			len = cake_advance_shaper(q, b, skb,
2105 						  now, true);
2106 			flow->deficit -= len;
2107 			b->tin_deficit -= len;
2108 		}
2109 		flow->dropped++;
2110 		b->tin_dropped++;
2111 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2112 		qdisc_qstats_drop(sch);
2113 		kfree_skb(skb);
2114 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2115 			goto retry;
2116 	}
2117 
2118 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2119 	qdisc_bstats_update(sch, skb);
2120 
2121 	/* collect delay stats */
2122 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2123 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2124 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2125 				  delay > b->peak_delay ? 2 : 8);
2126 	b->base_delay = cake_ewma(b->base_delay, delay,
2127 				  delay < b->base_delay ? 2 : 8);
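
	/* These three delay figures use deliberately lopsided EWMA shifts:
	 * peak_delay follows increases at weight 1/4 but decays at only
	 * 1/256, base_delay does the opposite, and avge_delay always uses
	 * 1/256, so the stats dump reports a quickly-rising peak, a
	 * quickly-falling floor and a long-run average.
	 */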
2128 
2129 	len = cake_advance_shaper(q, b, skb, now, false);
2130 	flow->deficit -= len;
2131 	b->tin_deficit -= len;
2132 
2133 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2134 		u64 next = min(ktime_to_ns(q->time_next_packet),
2135 			       ktime_to_ns(q->failsafe_next_packet));
2136 
2137 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2138 	} else if (!sch->q.qlen) {
2139 		int i;
2140 
2141 		for (i = 0; i < q->tin_cnt; i++) {
2142 			if (q->tins[i].decaying_flow_count) {
2143 				ktime_t next = \
2144 					ktime_add_ns(now,
2145 						     q->tins[i].cparams.target);
2146 
2147 				qdisc_watchdog_schedule_ns(&q->watchdog,
2148 							   ktime_to_ns(next));
2149 				break;
2150 			}
2151 		}
2152 	}
2153 
2154 	if (q->overflow_timeout)
2155 		q->overflow_timeout--;
2156 
2157 	return skb;
2158 }
2159 
2160 static void cake_reset(struct Qdisc *sch)
2161 {
2162 	u32 c;
2163 
2164 	for (c = 0; c < CAKE_MAX_TINS; c++)
2165 		cake_clear_tin(sch, c);
2166 }
2167 
2168 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2169 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2170 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2171 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2172 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2173 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2174 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2175 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2176 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2177 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2178 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2179 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2180 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2181 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2182 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2183 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2184 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2185 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2186 };
2187 
2188 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2189 			  u64 target_ns, u64 rtt_est_ns)
2190 {
2191 	/* convert byte-rate into time-per-byte, applying a floor of MIN_RATE
2192 	 * so the shaper always unwedges in reasonable time.
2193 	 */
2194 	static const u64 MIN_RATE = 64;
2195 	u32 byte_target = mtu;
2196 	u64 byte_target_ns;
2197 	u8  rate_shft = 0;
2198 	u64 rate_ns = 0;
2199 
2200 	b->flow_quantum = 1514;
2201 	if (rate) {
2202 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2203 		rate_shft = 34;
2204 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2205 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2206 		while (!!(rate_ns >> 34)) {
2207 			rate_ns >>= 1;
2208 			rate_shft--;
2209 		}
2210 	} /* else unlimited, ie. zero delay */
2211 
2212 	b->tin_rate_bps  = rate;
2213 	b->tin_rate_ns   = rate_ns;
2214 	b->tin_rate_shft = rate_shft;
2215 
2216 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2217 
2218 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2219 	b->cparams.interval = max(rtt_est_ns +
2220 				     b->cparams.target - target_ns,
2221 				     b->cparams.target * 2);
2222 	b->cparams.mtu_time = byte_target_ns;
2223 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2224 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2225 }
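
/* A rough worked example of the conversion above, assuming a shaper rate of
 * 1250000 bytes/sec (10 Mbit/s) and a 1514 byte MTU: rate_ns >> rate_shft
 * comes to 800 ns per byte, so byte_target_ns (one MTU's serialisation time)
 * is ~1.21 ms.  The AQM target is then max(1.5 * 1.21 ms, 5 ms) = 5 ms and
 * the interval stays at the configured 100 ms; only at rates low enough for
 * an MTU to take more than ~3.3 ms on the wire do the auto-tuned target and
 * interval grow beyond those defaults.
 */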
2226 
2227 static int cake_config_besteffort(struct Qdisc *sch)
2228 {
2229 	struct cake_sched_data *q = qdisc_priv(sch);
2230 	struct cake_tin_data *b = &q->tins[0];
2231 	u32 mtu = psched_mtu(qdisc_dev(sch));
2232 	u64 rate = q->rate_bps;
2233 
2234 	q->tin_cnt = 1;
2235 
2236 	q->tin_index = besteffort;
2237 	q->tin_order = normal_order;
2238 
2239 	cake_set_rate(b, rate, mtu,
2240 		      us_to_ns(q->target), us_to_ns(q->interval));
2241 	b->tin_quantum = 65535;
2242 
2243 	return 0;
2244 }
2245 
2246 static int cake_config_precedence(struct Qdisc *sch)
2247 {
2248 	/* convert high-level (user visible) parameters into internal format */
2249 	struct cake_sched_data *q = qdisc_priv(sch);
2250 	u32 mtu = psched_mtu(qdisc_dev(sch));
2251 	u64 rate = q->rate_bps;
2252 	u32 quantum = 256;
2253 	u32 i;
2254 
2255 	q->tin_cnt = 8;
2256 	q->tin_index = precedence;
2257 	q->tin_order = normal_order;
2258 
2259 	for (i = 0; i < q->tin_cnt; i++) {
2260 		struct cake_tin_data *b = &q->tins[i];
2261 
2262 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2263 			      us_to_ns(q->interval));
2264 
2265 		b->tin_quantum = max_t(u16, 1U, quantum);
2266 
2267 		/* calculate next class's parameters */
2268 		rate  *= 7;
2269 		rate >>= 3;
2270 
2271 		quantum  *= 7;
2272 		quantum >>= 3;
2273 	}
2274 
2275 	return 0;
2276 }
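
/* Each pass of the loop above hands the next tin 7/8 of the previous tin's
 * rate threshold and DRR quantum, so with eight tins the last one ends up
 * with (7/8)^7, roughly 39%, of the base rate as its bandwidth-sharing
 * threshold.  cake_config_diffserv8() below applies the same progression to
 * its pruned class list.
 */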
2277 
2278 /*	List of known Diffserv codepoints:
2279  *
2280  *	Least Effort (CS1)
2281  *	Best Effort (CS0)
2282  *	Max Reliability & LLT "Lo" (TOS1)
2283  *	Max Throughput (TOS2)
2284  *	Min Delay (TOS4)
2285  *	LLT "La" (TOS5)
2286  *	Assured Forwarding 1 (AF1x) - x3
2287  *	Assured Forwarding 2 (AF2x) - x3
2288  *	Assured Forwarding 3 (AF3x) - x3
2289  *	Assured Forwarding 4 (AF4x) - x3
2290  *	Precedence Class 2 (CS2)
2291  *	Precedence Class 3 (CS3)
2292  *	Precedence Class 4 (CS4)
2293  *	Precedence Class 5 (CS5)
2294  *	Precedence Class 6 (CS6)
2295  *	Precedence Class 7 (CS7)
2296  *	Voice Admit (VA)
2297  *	Expedited Forwarding (EF)
2298  *
2299  *	Total 26 codepoints.
2300  */
2301 
2302 /*	List of traffic classes in RFC 4594:
2303  *		(roughly descending order of contended priority)
2304  *		(roughly ascending order of uncontended throughput)
2305  *
2306  *	Network Control (CS6,CS7)      - routing traffic
2307  *	Telephony (EF,VA)         - aka. VoIP streams
2308  *	Signalling (CS5)               - VoIP setup
2309  *	Multimedia Conferencing (AF4x) - aka. video calls
2310  *	Realtime Interactive (CS4)     - eg. games
2311  *	Multimedia Streaming (AF3x)    - eg. YouTube, Netflix, Twitch
2312  *	Broadcast Video (CS3)
2313  *	Low Latency Data (AF2x,TOS4)      - eg. database
2314  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2315  *	Standard Service (CS0 & unrecognised codepoints)
2316  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2317  *	Low Priority Data (CS1)           - eg. BitTorrent
2318  *
2319  *	Total 12 traffic classes.
2320  */
2321 
2322 static int cake_config_diffserv8(struct Qdisc *sch)
2323 {
2324 /*	Pruned list of traffic classes for typical applications:
2325  *
2326  *		Network Control          (CS6, CS7)
2327  *		Minimum Latency          (EF, VA, CS5, CS4)
2328  *		Interactive Shell        (CS2, TOS1)
2329  *		Low Latency Transactions (AF2x, TOS4)
2330  *		Video Streaming          (AF4x, AF3x, CS3)
2331  *		Bog Standard             (CS0 etc.)
2332  *		High Throughput          (AF1x, TOS2)
2333  *		Background Traffic       (CS1)
2334  *
2335  *		Total 8 traffic classes.
2336  */
2337 
2338 	struct cake_sched_data *q = qdisc_priv(sch);
2339 	u32 mtu = psched_mtu(qdisc_dev(sch));
2340 	u64 rate = q->rate_bps;
2341 	u32 quantum = 256;
2342 	u32 i;
2343 
2344 	q->tin_cnt = 8;
2345 
2346 	/* codepoint to class mapping */
2347 	q->tin_index = diffserv8;
2348 	q->tin_order = normal_order;
2349 
2350 	/* class characteristics */
2351 	for (i = 0; i < q->tin_cnt; i++) {
2352 		struct cake_tin_data *b = &q->tins[i];
2353 
2354 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2355 			      us_to_ns(q->interval));
2356 
2357 		b->tin_quantum = max_t(u16, 1U, quantum);
2358 
2359 		/* calculate next class's parameters */
2360 		rate  *= 7;
2361 		rate >>= 3;
2362 
2363 		quantum  *= 7;
2364 		quantum >>= 3;
2365 	}
2366 
2367 	return 0;
2368 }
2369 
2370 static int cake_config_diffserv4(struct Qdisc *sch)
2371 {
2372 /*  Further pruned list of traffic classes for four-class system:
2373  *
2374  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2375  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2376  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2377  *	    Background Traffic (CS1)
2378  *
2379  *		Total 4 traffic classes.
2380  */
2381 
2382 	struct cake_sched_data *q = qdisc_priv(sch);
2383 	u32 mtu = psched_mtu(qdisc_dev(sch));
2384 	u64 rate = q->rate_bps;
2385 	u32 quantum = 1024;
2386 
2387 	q->tin_cnt = 4;
2388 
2389 	/* codepoint to class mapping */
2390 	q->tin_index = diffserv4;
2391 	q->tin_order = bulk_order;
2392 
2393 	/* class characteristics */
2394 	cake_set_rate(&q->tins[0], rate, mtu,
2395 		      us_to_ns(q->target), us_to_ns(q->interval));
2396 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2397 		      us_to_ns(q->target), us_to_ns(q->interval));
2398 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2399 		      us_to_ns(q->target), us_to_ns(q->interval));
2400 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2401 		      us_to_ns(q->target), us_to_ns(q->interval));
2402 
2403 	/* bandwidth-sharing weights */
2404 	q->tins[0].tin_quantum = quantum;
2405 	q->tins[1].tin_quantum = quantum >> 4;
2406 	q->tins[2].tin_quantum = quantum >> 1;
2407 	q->tins[3].tin_quantum = quantum >> 2;
2408 
2409 	return 0;
2410 }
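
/* A quick sanity check of the shares above, assuming a base rate of
 * 1250000 bytes/sec (10 Mbit/s): the four tins get bandwidth-sharing
 * thresholds of 1250000, 78125 (1/16), 625000 (1/2) and 312500 (1/4)
 * bytes/sec, and the tin_quantum values carry the same 16:1:8:4 ratio into
 * the weighted DRR between tins.
 */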
2411 
2412 static int cake_config_diffserv3(struct Qdisc *sch)
2413 {
2414 /*  Simplified Diffserv structure with 3 tins.
2415  *		Low Priority		(CS1)
2416  *		Best Effort
2417  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2418  */
2419 	struct cake_sched_data *q = qdisc_priv(sch);
2420 	u32 mtu = psched_mtu(qdisc_dev(sch));
2421 	u64 rate = q->rate_bps;
2422 	u32 quantum = 1024;
2423 
2424 	q->tin_cnt = 3;
2425 
2426 	/* codepoint to class mapping */
2427 	q->tin_index = diffserv3;
2428 	q->tin_order = bulk_order;
2429 
2430 	/* class characteristics */
2431 	cake_set_rate(&q->tins[0], rate, mtu,
2432 		      us_to_ns(q->target), us_to_ns(q->interval));
2433 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2434 		      us_to_ns(q->target), us_to_ns(q->interval));
2435 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2436 		      us_to_ns(q->target), us_to_ns(q->interval));
2437 
2438 	/* bandwidth-sharing weights */
2439 	q->tins[0].tin_quantum = quantum;
2440 	q->tins[1].tin_quantum = quantum >> 4;
2441 	q->tins[2].tin_quantum = quantum >> 2;
2442 
2443 	return 0;
2444 }
2445 
2446 static void cake_reconfigure(struct Qdisc *sch)
2447 {
2448 	struct cake_sched_data *q = qdisc_priv(sch);
2449 	int c, ft;
2450 
2451 	switch (q->tin_mode) {
2452 	case CAKE_DIFFSERV_BESTEFFORT:
2453 		ft = cake_config_besteffort(sch);
2454 		break;
2455 
2456 	case CAKE_DIFFSERV_PRECEDENCE:
2457 		ft = cake_config_precedence(sch);
2458 		break;
2459 
2460 	case CAKE_DIFFSERV_DIFFSERV8:
2461 		ft = cake_config_diffserv8(sch);
2462 		break;
2463 
2464 	case CAKE_DIFFSERV_DIFFSERV4:
2465 		ft = cake_config_diffserv4(sch);
2466 		break;
2467 
2468 	case CAKE_DIFFSERV_DIFFSERV3:
2469 	default:
2470 		ft = cake_config_diffserv3(sch);
2471 		break;
2472 	}
2473 
2474 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2475 		cake_clear_tin(sch, c);
2476 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2477 	}
2478 
2479 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2480 	q->rate_shft = q->tins[ft].tin_rate_shft;
2481 
2482 	if (q->buffer_config_limit) {
2483 		q->buffer_limit = q->buffer_config_limit;
2484 	} else if (q->rate_bps) {
2485 		u64 t = q->rate_bps * q->interval;
2486 
2487 		do_div(t, USEC_PER_SEC / 4);
2488 		q->buffer_limit = max_t(u32, t, 4U << 20);
2489 	} else {
2490 		q->buffer_limit = ~0;
2491 	}
2492 
2493 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2494 
2495 	q->buffer_limit = min(q->buffer_limit,
2496 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2497 				  q->buffer_config_limit));
2498 }
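
/* Buffer sizing sketch for the default case above: with no explicit memory
 * limit and eg. rate_bps = 1250000 bytes/sec at the default 100000 us
 * interval, t = 1250000 * 100000 / 250000 = 500000 bytes, which is below the
 * 4 MiB floor, so buffer_limit becomes 4 MiB before the final clamp against
 * sch->limit packets' worth of MTU-sized skbs.
 */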
2499 
2500 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2501 		       struct netlink_ext_ack *extack)
2502 {
2503 	struct cake_sched_data *q = qdisc_priv(sch);
2504 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2505 	int err;
2506 
2507 	if (!opt)
2508 		return -EINVAL;
2509 
2510 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2511 					  extack);
2512 	if (err < 0)
2513 		return err;
2514 
2515 	if (tb[TCA_CAKE_NAT]) {
2516 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2517 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2518 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2519 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2520 #else
2521 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2522 				    "No conntrack support in kernel");
2523 		return -EOPNOTSUPP;
2524 #endif
2525 	}
2526 
2527 	if (tb[TCA_CAKE_BASE_RATE64])
2528 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2529 
2530 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2531 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2532 
2533 	if (tb[TCA_CAKE_WASH]) {
2534 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2535 			q->rate_flags |= CAKE_FLAG_WASH;
2536 		else
2537 			q->rate_flags &= ~CAKE_FLAG_WASH;
2538 	}
2539 
2540 	if (tb[TCA_CAKE_FLOW_MODE])
2541 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2542 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2543 					CAKE_FLOW_MASK));
2544 
2545 	if (tb[TCA_CAKE_ATM])
2546 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2547 
2548 	if (tb[TCA_CAKE_OVERHEAD]) {
2549 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2550 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2551 
2552 		q->max_netlen = 0;
2553 		q->max_adjlen = 0;
2554 		q->min_netlen = ~0;
2555 		q->min_adjlen = ~0;
2556 	}
2557 
2558 	if (tb[TCA_CAKE_RAW]) {
2559 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2560 
2561 		q->max_netlen = 0;
2562 		q->max_adjlen = 0;
2563 		q->min_netlen = ~0;
2564 		q->min_adjlen = ~0;
2565 	}
2566 
2567 	if (tb[TCA_CAKE_MPU])
2568 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2569 
2570 	if (tb[TCA_CAKE_RTT]) {
2571 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2572 
2573 		if (!q->interval)
2574 			q->interval = 1;
2575 	}
2576 
2577 	if (tb[TCA_CAKE_TARGET]) {
2578 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2579 
2580 		if (!q->target)
2581 			q->target = 1;
2582 	}
2583 
2584 	if (tb[TCA_CAKE_AUTORATE]) {
2585 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2586 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2587 		else
2588 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2589 	}
2590 
2591 	if (tb[TCA_CAKE_INGRESS]) {
2592 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2593 			q->rate_flags |= CAKE_FLAG_INGRESS;
2594 		else
2595 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2596 	}
2597 
2598 	if (tb[TCA_CAKE_ACK_FILTER])
2599 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2600 
2601 	if (tb[TCA_CAKE_MEMORY])
2602 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2603 
2604 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2605 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2606 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2607 		else
2608 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2609 	}
2610 
2611 	if (tb[TCA_CAKE_FWMARK]) {
2612 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2613 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2614 	}
2615 
2616 	if (q->tins) {
2617 		sch_tree_lock(sch);
2618 		cake_reconfigure(sch);
2619 		sch_tree_unlock(sch);
2620 	}
2621 
2622 	return 0;
2623 }
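
/* These attributes are normally set from userspace through the iproute2
 * tc-cake front end rather than by hand-rolled netlink; a typical invocation
 * (keywords as documented in tc-cake(8)) might be:
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which arrives here as TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE and
 * TCA_CAKE_NAT respectively.
 */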
2624 
2625 static void cake_destroy(struct Qdisc *sch)
2626 {
2627 	struct cake_sched_data *q = qdisc_priv(sch);
2628 
2629 	qdisc_watchdog_cancel(&q->watchdog);
2630 	tcf_block_put(q->block);
2631 	kvfree(q->tins);
2632 }
2633 
2634 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2635 		     struct netlink_ext_ack *extack)
2636 {
2637 	struct cake_sched_data *q = qdisc_priv(sch);
2638 	int i, j, err;
2639 
2640 	sch->limit = 10240;
2641 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2642 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2643 
2644 	q->rate_bps = 0; /* unlimited by default */
2645 
2646 	q->interval = 100000; /* 100ms default */
2647 	q->target   =   5000; /* 5ms: codel RFC argues
2648 			       * for 5 to 10% of interval
2649 			       */
2650 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2651 	q->cur_tin = 0;
2652 	q->cur_flow  = 0;
2653 
2654 	qdisc_watchdog_init(&q->watchdog, sch);
2655 
2656 	if (opt) {
2657 		int err = cake_change(sch, opt, extack);
2658 
2659 		if (err)
2660 			return err;
2661 	}
2662 
2663 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2664 	if (err)
2665 		return err;
2666 
2667 	quantum_div[0] = ~0;
2668 	for (i = 1; i <= CAKE_QUEUES; i++)
2669 		quantum_div[i] = 65535 / i;
2670 
2671 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2672 			   GFP_KERNEL);
2673 	if (!q->tins)
2674 		goto nomem;
2675 
2676 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2677 		struct cake_tin_data *b = q->tins + i;
2678 
2679 		INIT_LIST_HEAD(&b->new_flows);
2680 		INIT_LIST_HEAD(&b->old_flows);
2681 		INIT_LIST_HEAD(&b->decaying_flows);
2682 		b->sparse_flow_count = 0;
2683 		b->bulk_flow_count = 0;
2684 		b->decaying_flow_count = 0;
2685 
2686 		for (j = 0; j < CAKE_QUEUES; j++) {
2687 			struct cake_flow *flow = b->flows + j;
2688 			u32 k = j * CAKE_MAX_TINS + i;
2689 
2690 			INIT_LIST_HEAD(&flow->flowchain);
2691 			cobalt_vars_init(&flow->cvars);
2692 
2693 			q->overflow_heap[k].t = i;
2694 			q->overflow_heap[k].b = j;
2695 			b->overflow_idx[j] = k;
2696 		}
2697 	}
2698 
2699 	cake_reconfigure(sch);
2700 	q->avg_peak_bandwidth = q->rate_bps;
2701 	q->min_netlen = ~0;
2702 	q->min_adjlen = ~0;
2703 	return 0;
2704 
2705 nomem:
2706 	cake_destroy(sch);
2707 	return -ENOMEM;
2708 }
2709 
2710 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2711 {
2712 	struct cake_sched_data *q = qdisc_priv(sch);
2713 	struct nlattr *opts;
2714 
2715 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2716 	if (!opts)
2717 		goto nla_put_failure;
2718 
2719 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2720 			      TCA_CAKE_PAD))
2721 		goto nla_put_failure;
2722 
2723 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2724 			q->flow_mode & CAKE_FLOW_MASK))
2725 		goto nla_put_failure;
2726 
2727 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2728 		goto nla_put_failure;
2729 
2730 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2731 		goto nla_put_failure;
2732 
2733 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2734 		goto nla_put_failure;
2735 
2736 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2737 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2738 		goto nla_put_failure;
2739 
2740 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2741 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2742 		goto nla_put_failure;
2743 
2744 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2745 		goto nla_put_failure;
2746 
2747 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2748 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2749 		goto nla_put_failure;
2750 
2751 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2752 		goto nla_put_failure;
2753 
2754 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2755 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2756 		goto nla_put_failure;
2757 
2758 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2759 		goto nla_put_failure;
2760 
2761 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2762 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2763 			goto nla_put_failure;
2764 
2765 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2766 		goto nla_put_failure;
2767 
2768 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2769 		goto nla_put_failure;
2770 
2771 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2772 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2773 		goto nla_put_failure;
2774 
2775 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2776 		goto nla_put_failure;
2777 
2778 	return nla_nest_end(skb, opts);
2779 
2780 nla_put_failure:
2781 	return -1;
2782 }
2783 
2784 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2785 {
2786 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2787 	struct cake_sched_data *q = qdisc_priv(sch);
2788 	struct nlattr *tstats, *ts;
2789 	int i;
2790 
2791 	if (!stats)
2792 		return -1;
2793 
2794 #define PUT_STAT_U32(attr, data) do {				       \
2795 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2796 			goto nla_put_failure;			       \
2797 	} while (0)
2798 #define PUT_STAT_U64(attr, data) do {				       \
2799 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2800 					data, TCA_CAKE_STATS_PAD)) \
2801 			goto nla_put_failure;			       \
2802 	} while (0)
2803 
2804 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2805 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2806 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2807 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2808 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2809 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2810 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2811 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2812 
2813 #undef PUT_STAT_U32
2814 #undef PUT_STAT_U64
2815 
2816 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2817 	if (!tstats)
2818 		goto nla_put_failure;
2819 
2820 #define PUT_TSTAT_U32(attr, data) do {					\
2821 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2822 			goto nla_put_failure;				\
2823 	} while (0)
2824 #define PUT_TSTAT_U64(attr, data) do {					\
2825 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2826 					data, TCA_CAKE_TIN_STATS_PAD))	\
2827 			goto nla_put_failure;				\
2828 	} while (0)
2829 
2830 	for (i = 0; i < q->tin_cnt; i++) {
2831 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2832 
2833 		ts = nla_nest_start_noflag(d->skb, i + 1);
2834 		if (!ts)
2835 			goto nla_put_failure;
2836 
2837 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2838 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2839 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2840 
2841 		PUT_TSTAT_U32(TARGET_US,
2842 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2843 		PUT_TSTAT_U32(INTERVAL_US,
2844 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2845 
2846 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2847 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2848 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2849 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2850 
2851 		PUT_TSTAT_U32(PEAK_DELAY_US,
2852 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2853 		PUT_TSTAT_U32(AVG_DELAY_US,
2854 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2855 		PUT_TSTAT_U32(BASE_DELAY_US,
2856 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2857 
2858 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2859 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2860 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2861 
2862 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2863 					    b->decaying_flow_count);
2864 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2865 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2866 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2867 
2868 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2869 		nla_nest_end(d->skb, ts);
2870 	}
2871 
2872 #undef PUT_TSTAT_U32
2873 #undef PUT_TSTAT_U64
2874 
2875 	nla_nest_end(d->skb, tstats);
2876 	return nla_nest_end(d->skb, stats);
2877 
2878 nla_put_failure:
2879 	nla_nest_cancel(d->skb, stats);
2880 	return -1;
2881 }
2882 
2883 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2884 {
2885 	return NULL;
2886 }
2887 
2888 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2889 {
2890 	return 0;
2891 }
2892 
2893 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2894 			       u32 classid)
2895 {
2896 	return 0;
2897 }
2898 
2899 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2900 {
2901 }
2902 
2903 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2904 					struct netlink_ext_ack *extack)
2905 {
2906 	struct cake_sched_data *q = qdisc_priv(sch);
2907 
2908 	if (cl)
2909 		return NULL;
2910 	return q->block;
2911 }
2912 
2913 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2914 			   struct sk_buff *skb, struct tcmsg *tcm)
2915 {
2916 	tcm->tcm_handle |= TC_H_MIN(cl);
2917 	return 0;
2918 }
2919 
2920 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2921 				 struct gnet_dump *d)
2922 {
2923 	struct cake_sched_data *q = qdisc_priv(sch);
2924 	const struct cake_flow *flow = NULL;
2925 	struct gnet_stats_queue qs = { 0 };
2926 	struct nlattr *stats;
2927 	u32 idx = cl - 1;
2928 
2929 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2930 		const struct cake_tin_data *b = \
2931 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2932 		const struct sk_buff *skb;
2933 
2934 		flow = &b->flows[idx % CAKE_QUEUES];
2935 
2936 		if (flow->head) {
2937 			sch_tree_lock(sch);
2938 			skb = flow->head;
2939 			while (skb) {
2940 				qs.qlen++;
2941 				skb = skb->next;
2942 			}
2943 			sch_tree_unlock(sch);
2944 		}
2945 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2946 		qs.drops = flow->dropped;
2947 	}
2948 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2949 		return -1;
2950 	if (flow) {
2951 		ktime_t now = ktime_get();
2952 
2953 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2954 		if (!stats)
2955 			return -1;
2956 
2957 #define PUT_STAT_U32(attr, data) do {				       \
2958 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2959 			goto nla_put_failure;			       \
2960 	} while (0)
2961 #define PUT_STAT_S32(attr, data) do {				       \
2962 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2963 			goto nla_put_failure;			       \
2964 	} while (0)
2965 
2966 		PUT_STAT_S32(DEFICIT, flow->deficit);
2967 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2968 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2969 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2970 		if (flow->cvars.p_drop) {
2971 			PUT_STAT_S32(BLUE_TIMER_US,
2972 				     ktime_to_us(
2973 					     ktime_sub(now,
2974 						     flow->cvars.blue_timer)));
2975 		}
2976 		if (flow->cvars.dropping) {
2977 			PUT_STAT_S32(DROP_NEXT_US,
2978 				     ktime_to_us(
2979 					     ktime_sub(now,
2980 						       flow->cvars.drop_next)));
2981 		}
2982 
2983 		if (nla_nest_end(d->skb, stats) < 0)
2984 			return -1;
2985 	}
2986 
2987 	return 0;
2988 
2989 nla_put_failure:
2990 	nla_nest_cancel(d->skb, stats);
2991 	return -1;
2992 }
2993 
2994 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2995 {
2996 	struct cake_sched_data *q = qdisc_priv(sch);
2997 	unsigned int i, j;
2998 
2999 	if (arg->stop)
3000 		return;
3001 
3002 	for (i = 0; i < q->tin_cnt; i++) {
3003 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3004 
3005 		for (j = 0; j < CAKE_QUEUES; j++) {
3006 			if (list_empty(&b->flows[j].flowchain) ||
3007 			    arg->count < arg->skip) {
3008 				arg->count++;
3009 				continue;
3010 			}
3011 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3012 				arg->stop = 1;
3013 				break;
3014 			}
3015 			arg->count++;
3016 		}
3017 	}
3018 }
3019 
3020 static const struct Qdisc_class_ops cake_class_ops = {
3021 	.leaf		=	cake_leaf,
3022 	.find		=	cake_find,
3023 	.tcf_block	=	cake_tcf_block,
3024 	.bind_tcf	=	cake_bind,
3025 	.unbind_tcf	=	cake_unbind,
3026 	.dump		=	cake_dump_class,
3027 	.dump_stats	=	cake_dump_class_stats,
3028 	.walk		=	cake_walk,
3029 };
3030 
3031 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3032 	.cl_ops		=	&cake_class_ops,
3033 	.id		=	"cake",
3034 	.priv_size	=	sizeof(struct cake_sched_data),
3035 	.enqueue	=	cake_enqueue,
3036 	.dequeue	=	cake_dequeue,
3037 	.peek		=	qdisc_peek_dequeued,
3038 	.init		=	cake_init,
3039 	.reset		=	cake_reset,
3040 	.destroy	=	cake_destroy,
3041 	.change		=	cake_change,
3042 	.dump		=	cake_dump,
3043 	.dump_stats	=	cake_dump_stats,
3044 	.owner		=	THIS_MODULE,
3045 };
3046 
3047 static int __init cake_module_init(void)
3048 {
3049 	return register_qdisc(&cake_qdisc_ops);
3050 }
3051 
3052 static void __exit cake_module_exit(void)
3053 {
3054 	unregister_qdisc(&cake_qdisc_ops);
3055 }
3056 
3057 module_init(cake_module_init)
3058 module_exit(cake_module_exit)
3059 MODULE_AUTHOR("Jonathan Morton");
3060 MODULE_LICENSE("Dual BSD/GPL");
3061 MODULE_DESCRIPTION("The CAKE shaper.");
3062