xref: /openbmc/linux/net/sched/sch_cake.c (revision 176f011b)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
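
/* An illustrative invocation (assumed syntax of the matching tc(8) cake
 * support; not part of this file):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv3 nat
 *
 * This sets a 20 Mbit/s shaper, the three-tin Diffserv mapping and NAT-aware
 * flow hashing; everything not specified keeps its default.
 */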
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop interval
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
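
/* The BLUE probabilities are 0.32 fixed point: a probability p is stored as
 * p * 2^32, so e.g. 0x01000000 is 2^24 / 2^32 = 1/256.  A packet is then
 * dropped when prandom_u32() < p_drop (see cobalt_should_drop() below).
 * (Worked example added for illustration; the constants actually used are
 * set up when the shaper is configured.)
 */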
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_refcnt;
142 	u16 dsthost_refcnt;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum_prio;
177 	u16	tin_quantum_band;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
215 	u16		rate_shft;
216 	ktime_t		time_next_packet;
217 	ktime_t		failsafe_next_packet;
218 	u64		rate_ns;
219 	u64		rate_bps;
220 	u16		rate_flags;
221 	s16		rate_overhead;
222 	u16		rate_mpu;
223 	u64		interval;
224 	u64		target;
225 
226 	/* resource tracking */
227 	u32		buffer_used;
228 	u32		buffer_max_used;
229 	u32		buffer_limit;
230 	u32		buffer_config_limit;
231 
232 	/* indices for dequeue */
233 	u16		cur_tin;
234 	u16		cur_flow;
235 
236 	struct qdisc_watchdog watchdog;
237 	const u8	*tin_index;
238 	const u8	*tin_order;
239 
240 	/* bandwidth capacity estimate */
241 	ktime_t		last_packet_time;
242 	ktime_t		avg_window_begin;
243 	u64		avg_packet_interval;
244 	u64		avg_window_bytes;
245 	u64		avg_peak_bandwidth;
246 	ktime_t		last_reconfig_time;
247 
248 	/* packet length stats */
249 	u32		avg_netoff;
250 	u16		max_netlen;
251 	u16		max_adjlen;
252 	u16		min_netlen;
253 	u16		min_adjlen;
254 };
255 
256 enum {
257 	CAKE_FLAG_OVERHEAD	   = BIT(0),
258 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
259 	CAKE_FLAG_INGRESS	   = BIT(2),
260 	CAKE_FLAG_WASH		   = BIT(3),
261 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
262 };
263 
264 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
265  * obtain the best features of each.  Codel is excellent on flows which
266  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
267  * unresponsive flows.
268  */
269 
270 struct cobalt_skb_cb {
271 	ktime_t enqueue_time;
272 	u32     adjusted_len;
273 };
274 
275 static u64 us_to_ns(u64 us)
276 {
277 	return us * NSEC_PER_USEC;
278 }
279 
280 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
281 {
282 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
283 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
284 }
285 
286 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
287 {
288 	return get_cobalt_cb(skb)->enqueue_time;
289 }
290 
291 static void cobalt_set_enqueue_time(struct sk_buff *skb,
292 				    ktime_t now)
293 {
294 	get_cobalt_cb(skb)->enqueue_time = now;
295 }
296 
297 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
298 
299 /* Diffserv lookup tables */
300 
301 static const u8 precedence[] = {
302 	0, 0, 0, 0, 0, 0, 0, 0,
303 	1, 1, 1, 1, 1, 1, 1, 1,
304 	2, 2, 2, 2, 2, 2, 2, 2,
305 	3, 3, 3, 3, 3, 3, 3, 3,
306 	4, 4, 4, 4, 4, 4, 4, 4,
307 	5, 5, 5, 5, 5, 5, 5, 5,
308 	6, 6, 6, 6, 6, 6, 6, 6,
309 	7, 7, 7, 7, 7, 7, 7, 7,
310 };
311 
312 static const u8 diffserv8[] = {
313 	2, 5, 1, 2, 4, 2, 2, 2,
314 	0, 2, 1, 2, 1, 2, 1, 2,
315 	5, 2, 4, 2, 4, 2, 4, 2,
316 	3, 2, 3, 2, 3, 2, 3, 2,
317 	6, 2, 3, 2, 3, 2, 3, 2,
318 	6, 2, 2, 2, 6, 2, 6, 2,
319 	7, 2, 2, 2, 2, 2, 2, 2,
320 	7, 2, 2, 2, 2, 2, 2, 2,
321 };
322 
323 static const u8 diffserv4[] = {
324 	0, 2, 0, 0, 2, 0, 0, 0,
325 	1, 0, 0, 0, 0, 0, 0, 0,
326 	2, 0, 2, 0, 2, 0, 2, 0,
327 	2, 0, 2, 0, 2, 0, 2, 0,
328 	3, 0, 2, 0, 2, 0, 2, 0,
329 	3, 0, 0, 0, 3, 0, 3, 0,
330 	3, 0, 0, 0, 0, 0, 0, 0,
331 	3, 0, 0, 0, 0, 0, 0, 0,
332 };
333 
334 static const u8 diffserv3[] = {
335 	0, 0, 0, 0, 2, 0, 0, 0,
336 	1, 0, 0, 0, 0, 0, 0, 0,
337 	0, 0, 0, 0, 0, 0, 0, 0,
338 	0, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 2, 0, 2, 0,
341 	2, 0, 0, 0, 0, 0, 0, 0,
342 	2, 0, 0, 0, 0, 0, 0, 0,
343 };
344 
345 static const u8 besteffort[] = {
346 	0, 0, 0, 0, 0, 0, 0, 0,
347 	0, 0, 0, 0, 0, 0, 0, 0,
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 };
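
/* These tables are indexed by the 6-bit DSCP extracted in
 * cake_handle_diffserv() (dsfield >> 2), and the value is the tin number.
 * For example, with diffserv3 above: CS1 (DSCP 8) maps to tin 1, EF
 * (DSCP 46) maps to tin 2, and anything not listed falls through to tin 0.
 */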
355 
356 /* tin priority order for stats dumping */
357 
358 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
359 static const u8 bulk_order[] = {1, 0, 2, 3};
360 
361 #define REC_INV_SQRT_CACHE (16)
362 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
363 
364 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
365  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
366  *
367  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
368  */
369 
370 static void cobalt_newton_step(struct cobalt_vars *vars)
371 {
372 	u32 invsqrt, invsqrt2;
373 	u64 val;
374 
375 	invsqrt = vars->rec_inv_sqrt;
376 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
377 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
378 
379 	val >>= 2; /* avoid overflow in following multiply */
380 	val = (val * invsqrt) >> (32 - 2 + 1);
381 
382 	vars->rec_inv_sqrt = val;
383 }
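
/* Illustrative sanity check (not from the original source): for count = 4 the
 * true value is 1/sqrt(4) = 0.5, i.e. 0x80000000 in Q0.32.  Substituting that
 * into the update gives (0.5 / 2) * (3 - 4 * 0.25) = 0.5, so the correct
 * answer is a fixed point of the iteration and repeated steps converge to it.
 */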
384 
385 static void cobalt_invsqrt(struct cobalt_vars *vars)
386 {
387 	if (vars->count < REC_INV_SQRT_CACHE)
388 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
389 	else
390 		cobalt_newton_step(vars);
391 }
392 
393 /* There is a big difference in timing between the accurate values placed in
394  * the cache and the approximations given by a single Newton step for small
395  * count values, particularly when stepping from count 1 to 2 or vice versa.
396  * Above 16, a single Newton step gives sufficient accuracy in either
397  * direction, given the precision stored.
398  *
399  * The magnitude of the error when stepping up to count 2 is such as to give
400  * the value that *should* have been produced at count 4.
401  */
402 
403 static void cobalt_cache_init(void)
404 {
405 	struct cobalt_vars v;
406 
407 	memset(&v, 0, sizeof(v));
408 	v.rec_inv_sqrt = ~0U;
409 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
410 
411 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
412 		cobalt_newton_step(&v);
413 		cobalt_newton_step(&v);
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 
417 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
418 	}
419 }
420 
421 static void cobalt_vars_init(struct cobalt_vars *vars)
422 {
423 	memset(vars, 0, sizeof(*vars));
424 
425 	if (!cobalt_rec_inv_sqrt_cache[0]) {
426 		cobalt_cache_init();
427 		cobalt_rec_inv_sqrt_cache[0] = ~0;
428 	}
429 }
430 
431 /* CoDel control_law is t + interval/sqrt(count)
432  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
433  * both sqrt() and divide operation.
434  */
435 static ktime_t cobalt_control(ktime_t t,
436 			      u64 interval,
437 			      u32 rec_inv_sqrt)
438 {
439 	return ktime_add_ns(t, reciprocal_scale(interval,
440 						rec_inv_sqrt));
441 }
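
/* Worked example (illustrative values): with interval = 100 ms and count = 4,
 * rec_inv_sqrt is ~0.5 in Q0.32 (0x80000000), so reciprocal_scale() yields
 * (100 ms * 0x80000000) >> 32 = 50 ms and the next signal is scheduled 50 ms
 * after t.  Larger counts pull the signals closer together, as in CoDel.
 */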
442 
443 /* Call this when a packet had to be dropped due to queue overflow.  Returns
444  * true if the BLUE state was quiescent before but active after this call.
445  */
446 static bool cobalt_queue_full(struct cobalt_vars *vars,
447 			      struct cobalt_params *p,
448 			      ktime_t now)
449 {
450 	bool up = false;
451 
452 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
453 		up = !vars->p_drop;
454 		vars->p_drop += p->p_inc;
455 		if (vars->p_drop < p->p_inc)
456 			vars->p_drop = ~0;
457 		vars->blue_timer = now;
458 	}
459 	vars->dropping = true;
460 	vars->drop_next = now;
461 	if (!vars->count)
462 		vars->count = 1;
463 
464 	return up;
465 }
466 
467 /* Call this when the queue was serviced but turned out to be empty.  Returns
468  * true if the BLUE state was active before but quiescent after this call.
469  */
470 static bool cobalt_queue_empty(struct cobalt_vars *vars,
471 			       struct cobalt_params *p,
472 			       ktime_t now)
473 {
474 	bool down = false;
475 
476 	if (vars->p_drop &&
477 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
478 		if (vars->p_drop < p->p_dec)
479 			vars->p_drop = 0;
480 		else
481 			vars->p_drop -= p->p_dec;
482 		vars->blue_timer = now;
483 		down = !vars->p_drop;
484 	}
485 	vars->dropping = false;
486 
487 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
488 		vars->count--;
489 		cobalt_invsqrt(vars);
490 		vars->drop_next = cobalt_control(vars->drop_next,
491 						 p->interval,
492 						 vars->rec_inv_sqrt);
493 	}
494 
495 	return down;
496 }
497 
498 /* Call this with a freshly dequeued packet for possible congestion marking.
499  * Returns true as an instruction to drop the packet, false for delivery.
500  */
501 static bool cobalt_should_drop(struct cobalt_vars *vars,
502 			       struct cobalt_params *p,
503 			       ktime_t now,
504 			       struct sk_buff *skb,
505 			       u32 bulk_flows)
506 {
507 	bool next_due, over_target, drop = false;
508 	ktime_t schedule;
509 	u64 sojourn;
510 
511 /* The 'schedule' variable records, in its sign, whether 'now' is before or
512  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
513  * scheduling decision is actually branched, without destroying that
514  * information.  Similarly, the first 'schedule' value calculated is preserved
515  * in the boolean 'next_due'.
516  *
517  * As for 'drop_next', we take advantage of the fact that 'interval' is both
518  * the delay between first exceeding 'target' and the first signalling event,
519  * *and* the scaling factor for the signalling frequency.  It's therefore very
520  * natural to use a single mechanism for both purposes, and eliminates a
521  * significant amount of reference Codel's spaghetti code.  To help with this,
522  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
523  * as possible to 1.0 in fixed-point.
524  */
525 
526 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
527 	schedule = ktime_sub(now, vars->drop_next);
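	/* Illustrative numbers (not from the original source): at 10 Mbit/s a
	 * 1500 byte packet takes ~1.2 ms to serialise, so with 8 bulk flows
	 * and the common 5 ms target the sojourn time below must exceed
	 * max(5 ms, 2 * 8 * 1.2 ms, 4 * 1.2 ms) = 19.2 ms before the queue
	 * counts as over target.
	 */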
528 	over_target = sojourn > p->target &&
529 		      sojourn > p->mtu_time * bulk_flows * 2 &&
530 		      sojourn > p->mtu_time * 4;
531 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
532 
533 	vars->ecn_marked = false;
534 
535 	if (over_target) {
536 		if (!vars->dropping) {
537 			vars->dropping = true;
538 			vars->drop_next = cobalt_control(now,
539 							 p->interval,
540 							 vars->rec_inv_sqrt);
541 		}
542 		if (!vars->count)
543 			vars->count = 1;
544 	} else if (vars->dropping) {
545 		vars->dropping = false;
546 	}
547 
548 	if (next_due && vars->dropping) {
549 		/* Use ECN mark if possible, otherwise drop */
550 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
551 
552 		vars->count++;
553 		if (!vars->count)
554 			vars->count--;
555 		cobalt_invsqrt(vars);
556 		vars->drop_next = cobalt_control(vars->drop_next,
557 						 p->interval,
558 						 vars->rec_inv_sqrt);
559 		schedule = ktime_sub(now, vars->drop_next);
560 	} else {
561 		while (next_due) {
562 			vars->count--;
563 			cobalt_invsqrt(vars);
564 			vars->drop_next = cobalt_control(vars->drop_next,
565 							 p->interval,
566 							 vars->rec_inv_sqrt);
567 			schedule = ktime_sub(now, vars->drop_next);
568 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
569 		}
570 	}
571 
572 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
573 	if (vars->p_drop)
574 		drop |= (prandom_u32() < vars->p_drop);
575 
576 	/* Overload the drop_next field as an activity timeout */
577 	if (!vars->count)
578 		vars->drop_next = ktime_add_ns(now, p->interval);
579 	else if (ktime_to_ns(schedule) > 0 && !drop)
580 		vars->drop_next = now;
581 
582 	return drop;
583 }
584 
585 static void cake_update_flowkeys(struct flow_keys *keys,
586 				 const struct sk_buff *skb)
587 {
588 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
589 	struct nf_conntrack_tuple tuple = {};
590 	bool rev = !skb->_nfct;
591 
592 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
593 		return;
594 
595 	if (!nf_ct_get_tuple_skb(&tuple, skb))
596 		return;
597 
598 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
599 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
600 
601 	if (keys->ports.ports) {
602 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
603 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
604 	}
605 #endif
606 }
607 
608 /* Cake's flow modes include several multi-bit values; a plain bitwise AND test
609  * would also match triple-isolate mode, so the helpers below compare against the full mask.
610  */
611 
612 static bool cake_dsrc(int flow_mode)
613 {
614 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
615 }
616 
617 static bool cake_ddst(int flow_mode)
618 {
619 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
620 }
621 
622 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
623 		     int flow_mode, u16 flow_override, u16 host_override)
624 {
625 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
626 	u16 reduced_hash, srchost_idx, dsthost_idx;
627 	struct flow_keys keys, host_keys;
628 
629 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
630 		return 0;
631 
632 	/* If both overrides are set we can skip packet dissection entirely */
633 	if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
634 	    (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
635 		goto skip_hash;
636 
637 	skb_flow_dissect_flow_keys(skb, &keys,
638 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
639 
640 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
641 		cake_update_flowkeys(&keys, skb);
642 
643 	/* flow_hash_from_keys() sorts the addresses by value, so we have
644 	 * to preserve their order in a separate data structure to treat
645 	 * src and dst host addresses as independently selectable.
646 	 */
647 	host_keys = keys;
648 	host_keys.ports.ports     = 0;
649 	host_keys.basic.ip_proto  = 0;
650 	host_keys.keyid.keyid     = 0;
651 	host_keys.tags.flow_label = 0;
652 
653 	switch (host_keys.control.addr_type) {
654 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
655 		host_keys.addrs.v4addrs.src = 0;
656 		dsthost_hash = flow_hash_from_keys(&host_keys);
657 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
658 		host_keys.addrs.v4addrs.dst = 0;
659 		srchost_hash = flow_hash_from_keys(&host_keys);
660 		break;
661 
662 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
663 		memset(&host_keys.addrs.v6addrs.src, 0,
664 		       sizeof(host_keys.addrs.v6addrs.src));
665 		dsthost_hash = flow_hash_from_keys(&host_keys);
666 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
667 		memset(&host_keys.addrs.v6addrs.dst, 0,
668 		       sizeof(host_keys.addrs.v6addrs.dst));
669 		srchost_hash = flow_hash_from_keys(&host_keys);
670 		break;
671 
672 	default:
673 		dsthost_hash = 0;
674 		srchost_hash = 0;
675 	}
676 
677 	/* This *must* be after the above switch, since as a
678 	 * side-effect it sorts the src and dst addresses.
679 	 */
680 	if (flow_mode & CAKE_FLOW_FLOWS)
681 		flow_hash = flow_hash_from_keys(&keys);
682 
683 skip_hash:
684 	if (flow_override)
685 		flow_hash = flow_override - 1;
686 	if (host_override) {
687 		dsthost_hash = host_override - 1;
688 		srchost_hash = host_override - 1;
689 	}
690 
691 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
692 		if (flow_mode & CAKE_FLOW_SRC_IP)
693 			flow_hash ^= srchost_hash;
694 
695 		if (flow_mode & CAKE_FLOW_DST_IP)
696 			flow_hash ^= dsthost_hash;
697 	}
698 
699 	reduced_hash = flow_hash % CAKE_QUEUES;
700 
701 	/* set-associative hashing */
702 	/* fast path if no hash collision (direct lookup succeeds) */
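	/* With CAKE_QUEUES = 1024 and CAKE_SET_WAYS = 8, reduced_hash selects
	 * one of 128 sets of eight queues: inner_hash is the position within
	 * the set and outer_hash the base index of the set, and the loops
	 * below probe the eight ways of that set.
	 */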
703 	if (likely(q->tags[reduced_hash] == flow_hash &&
704 		   q->flows[reduced_hash].set)) {
705 		q->way_directs++;
706 	} else {
707 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
708 		u32 outer_hash = reduced_hash - inner_hash;
709 		bool allocate_src = false;
710 		bool allocate_dst = false;
711 		u32 i, k;
712 
713 		/* check if any active queue in the set is reserved for
714 		 * this flow.
715 		 */
716 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
717 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
718 			if (q->tags[outer_hash + k] == flow_hash) {
719 				if (i)
720 					q->way_hits++;
721 
722 				if (!q->flows[outer_hash + k].set) {
723 					/* need to increment host refcnts */
724 					allocate_src = cake_dsrc(flow_mode);
725 					allocate_dst = cake_ddst(flow_mode);
726 				}
727 
728 				goto found;
729 			}
730 		}
731 
732 		/* no queue is reserved for this flow, look for an
733 		 * empty one.
734 		 */
735 		for (i = 0; i < CAKE_SET_WAYS;
736 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
737 			if (!q->flows[outer_hash + k].set) {
738 				q->way_misses++;
739 				allocate_src = cake_dsrc(flow_mode);
740 				allocate_dst = cake_ddst(flow_mode);
741 				goto found;
742 			}
743 		}
744 
745 		/* With no empty queues, default to the original
746 		 * queue, accept the collision, update the host tags.
747 		 */
748 		q->way_collisions++;
749 		q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
750 		q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
751 		allocate_src = cake_dsrc(flow_mode);
752 		allocate_dst = cake_ddst(flow_mode);
753 found:
754 		/* reserve queue for future packets in same flow */
755 		reduced_hash = outer_hash + k;
756 		q->tags[reduced_hash] = flow_hash;
757 
758 		if (allocate_src) {
759 			srchost_idx = srchost_hash % CAKE_QUEUES;
760 			inner_hash = srchost_idx % CAKE_SET_WAYS;
761 			outer_hash = srchost_idx - inner_hash;
762 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
763 				i++, k = (k + 1) % CAKE_SET_WAYS) {
764 				if (q->hosts[outer_hash + k].srchost_tag ==
765 				    srchost_hash)
766 					goto found_src;
767 			}
768 			for (i = 0; i < CAKE_SET_WAYS;
769 				i++, k = (k + 1) % CAKE_SET_WAYS) {
770 				if (!q->hosts[outer_hash + k].srchost_refcnt)
771 					break;
772 			}
773 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
774 found_src:
775 			srchost_idx = outer_hash + k;
776 			q->hosts[srchost_idx].srchost_refcnt++;
777 			q->flows[reduced_hash].srchost = srchost_idx;
778 		}
779 
780 		if (allocate_dst) {
781 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
782 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
783 			outer_hash = dsthost_idx - inner_hash;
784 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
785 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
786 				if (q->hosts[outer_hash + k].dsthost_tag ==
787 				    dsthost_hash)
788 					goto found_dst;
789 			}
790 			for (i = 0; i < CAKE_SET_WAYS;
791 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
792 				if (!q->hosts[outer_hash + k].dsthost_refcnt)
793 					break;
794 			}
795 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
796 found_dst:
797 			dsthost_idx = outer_hash + k;
798 			q->hosts[dsthost_idx].dsthost_refcnt++;
799 			q->flows[reduced_hash].dsthost = dsthost_idx;
800 		}
801 	}
802 
803 	return reduced_hash;
804 }
805 
806 /* helper functions: might be changed when/if skb uses a standard list_head */
807 /* remove one skb from head of slot queue */
808 
809 static struct sk_buff *dequeue_head(struct cake_flow *flow)
810 {
811 	struct sk_buff *skb = flow->head;
812 
813 	if (skb) {
814 		flow->head = skb->next;
815 		skb_mark_not_on_list(skb);
816 	}
817 
818 	return skb;
819 }
820 
821 /* add skb to flow queue (tail add) */
822 
823 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
824 {
825 	if (!flow->head)
826 		flow->head = skb;
827 	else
828 		flow->tail->next = skb;
829 	flow->tail = skb;
830 	skb->next = NULL;
831 }
832 
833 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
834 				    struct ipv6hdr *buf)
835 {
836 	unsigned int offset = skb_network_offset(skb);
837 	struct iphdr *iph;
838 
839 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
840 
841 	if (!iph)
842 		return NULL;
843 
844 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
845 		return skb_header_pointer(skb, offset + iph->ihl * 4,
846 					  sizeof(struct ipv6hdr), buf);
847 
848 	else if (iph->version == 4)
849 		return iph;
850 
851 	else if (iph->version == 6)
852 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
853 					  buf);
854 
855 	return NULL;
856 }
857 
858 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
859 				      void *buf, unsigned int bufsize)
860 {
861 	unsigned int offset = skb_network_offset(skb);
862 	const struct ipv6hdr *ipv6h;
863 	const struct tcphdr *tcph;
864 	const struct iphdr *iph;
865 	struct ipv6hdr _ipv6h;
866 	struct tcphdr _tcph;
867 
868 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
869 
870 	if (!ipv6h)
871 		return NULL;
872 
873 	if (ipv6h->version == 4) {
874 		iph = (struct iphdr *)ipv6h;
875 		offset += iph->ihl * 4;
876 
877 		/* special-case 6in4 tunnelling, as that is a common way to get
878 		 * v6 connectivity in the home
879 		 */
880 		if (iph->protocol == IPPROTO_IPV6) {
881 			ipv6h = skb_header_pointer(skb, offset,
882 						   sizeof(_ipv6h), &_ipv6h);
883 
884 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
885 				return NULL;
886 
887 			offset += sizeof(struct ipv6hdr);
888 
889 		} else if (iph->protocol != IPPROTO_TCP) {
890 			return NULL;
891 		}
892 
893 	} else if (ipv6h->version == 6) {
894 		if (ipv6h->nexthdr != IPPROTO_TCP)
895 			return NULL;
896 
897 		offset += sizeof(struct ipv6hdr);
898 	} else {
899 		return NULL;
900 	}
901 
902 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
903 	if (!tcph)
904 		return NULL;
905 
906 	return skb_header_pointer(skb, offset,
907 				  min(__tcp_hdrlen(tcph), bufsize), buf);
908 }
909 
910 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
911 				   int code, int *oplen)
912 {
913 	/* inspired by tcp_parse_options in tcp_input.c */
914 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
915 	const u8 *ptr = (const u8 *)(tcph + 1);
916 
917 	while (length > 0) {
918 		int opcode = *ptr++;
919 		int opsize;
920 
921 		if (opcode == TCPOPT_EOL)
922 			break;
923 		if (opcode == TCPOPT_NOP) {
924 			length--;
925 			continue;
926 		}
927 		opsize = *ptr++;
928 		if (opsize < 2 || opsize > length)
929 			break;
930 
931 		if (opcode == code) {
932 			*oplen = opsize;
933 			return ptr;
934 		}
935 
936 		ptr += opsize - 2;
937 		length -= opsize;
938 	}
939 
940 	return NULL;
941 }
942 
943 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
944  * bytes than the other. In the case where both sequences SACK bytes that the
945  * other doesn't, A is considered greater. DSACKs in A also make A
946  * considered greater.
947  *
948  * @return -1, 0 or 1 as normal compare functions
949  */
950 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
951 				  const struct tcphdr *tcph_b)
952 {
953 	const struct tcp_sack_block_wire *sack_a, *sack_b;
954 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
955 	u32 bytes_a = 0, bytes_b = 0;
956 	int oplen_a, oplen_b;
957 	bool first = true;
958 
959 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
960 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
961 
962 	/* pointers point to option contents */
963 	oplen_a -= TCPOLEN_SACK_BASE;
964 	oplen_b -= TCPOLEN_SACK_BASE;
965 
966 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
967 	    (!sack_b || oplen_b < sizeof(*sack_b)))
968 		return -1;
969 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
970 		 (!sack_a || oplen_a < sizeof(*sack_a)))
971 		return 1;
972 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
973 		 (!sack_b || oplen_b < sizeof(*sack_b)))
974 		return 0;
975 
976 	while (oplen_a >= sizeof(*sack_a)) {
977 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
978 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
979 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
980 		int oplen_tmp = oplen_b;
981 		bool found = false;
982 
983 		/* DSACK; always considered greater to prevent dropping */
984 		if (before(start_a, ack_seq_a))
985 			return -1;
986 
987 		bytes_a += end_a - start_a;
988 
989 		while (oplen_tmp >= sizeof(*sack_tmp)) {
990 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
991 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
992 
993 			/* first time through we count the total size */
994 			if (first)
995 				bytes_b += end_b - start_b;
996 
997 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
998 				found = true;
999 				if (!first)
1000 					break;
1001 			}
1002 			oplen_tmp -= sizeof(*sack_tmp);
1003 			sack_tmp++;
1004 		}
1005 
1006 		if (!found)
1007 			return -1;
1008 
1009 		oplen_a -= sizeof(*sack_a);
1010 		sack_a++;
1011 		first = false;
1012 	}
1013 
1014 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1015 	 * either the SACKs are equal, or B SACKs more bytes.
1016 	 */
1017 	return bytes_b > bytes_a ? 1 : 0;
1018 }
1019 
1020 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1021 				 u32 *tsval, u32 *tsecr)
1022 {
1023 	const u8 *ptr;
1024 	int opsize;
1025 
1026 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1027 
1028 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1029 		*tsval = get_unaligned_be32(ptr);
1030 		*tsecr = get_unaligned_be32(ptr + 4);
1031 	}
1032 }
1033 
1034 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1035 			       u32 tstamp_new, u32 tsecr_new)
1036 {
1037 	/* inspired by tcp_parse_options in tcp_input.c */
1038 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1039 	const u8 *ptr = (const u8 *)(tcph + 1);
1040 	u32 tstamp, tsecr;
1041 
1042 	/* 3 reserved flags must be unset to avoid future breakage
1043 	 * ACK must be set
1044 	 * ECE/CWR are handled separately
1045 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1046 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1047 	 * 0x00C00000 = CWR/ECE (handled separately)
1048 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1049 	 */
1050 	if (((tcp_flag_word(tcph) &
1051 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1052 		return false;
1053 
1054 	while (length > 0) {
1055 		int opcode = *ptr++;
1056 		int opsize;
1057 
1058 		if (opcode == TCPOPT_EOL)
1059 			break;
1060 		if (opcode == TCPOPT_NOP) {
1061 			length--;
1062 			continue;
1063 		}
1064 		opsize = *ptr++;
1065 		if (opsize < 2 || opsize > length)
1066 			break;
1067 
1068 		switch (opcode) {
1069 		case TCPOPT_MD5SIG: /* doesn't influence state */
1070 			break;
1071 
1072 		case TCPOPT_SACK: /* stricter checking performed later */
1073 			if (opsize % 8 != 2)
1074 				return false;
1075 			break;
1076 
1077 		case TCPOPT_TIMESTAMP:
1078 			/* only drop timestamps lower than new */
1079 			if (opsize != TCPOLEN_TIMESTAMP)
1080 				return false;
1081 			tstamp = get_unaligned_be32(ptr);
1082 			tsecr = get_unaligned_be32(ptr + 4);
1083 			if (after(tstamp, tstamp_new) ||
1084 			    after(tsecr, tsecr_new))
1085 				return false;
1086 			break;
1087 
1088 		case TCPOPT_MSS:  /* these should only be set on SYN */
1089 		case TCPOPT_WINDOW:
1090 		case TCPOPT_SACK_PERM:
1091 		case TCPOPT_FASTOPEN:
1092 		case TCPOPT_EXP:
1093 		default: /* don't drop if any unknown options are present */
1094 			return false;
1095 		}
1096 
1097 		ptr += opsize - 2;
1098 		length -= opsize;
1099 	}
1100 
1101 	return true;
1102 }
1103 
1104 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1105 				       struct cake_flow *flow)
1106 {
1107 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1108 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1109 	struct sk_buff *skb_check, *skb_prev = NULL;
1110 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1111 	unsigned char _tcph[64], _tcph_check[64];
1112 	const struct tcphdr *tcph, *tcph_check;
1113 	const struct iphdr *iph, *iph_check;
1114 	struct ipv6hdr _iph, _iph_check;
1115 	const struct sk_buff *skb;
1116 	int seglen, num_found = 0;
1117 	u32 tstamp = 0, tsecr = 0;
1118 	__be32 elig_flags = 0;
1119 	int sack_comp;
1120 
1121 	/* no other possible ACKs to filter */
1122 	if (flow->head == flow->tail)
1123 		return NULL;
1124 
1125 	skb = flow->tail;
1126 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1127 	iph = cake_get_iphdr(skb, &_iph);
1128 	if (!tcph)
1129 		return NULL;
1130 
1131 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1132 
1133 	/* the 'triggering' packet need only have the ACK flag set.
1134 	 * also check that SYN is not set, as there won't be any previous ACKs.
1135 	 */
1136 	if ((tcp_flag_word(tcph) &
1137 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1138 		return NULL;
1139 
1140 	/* the 'triggering' ACK is at the tail of the queue, we have already
1141 	 * returned if it is the only packet in the flow. loop through the rest
1142 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1143 	 * triggering one.
1144 	 */
1145 	for (skb_check = flow->head;
1146 	     skb_check && skb_check != skb;
1147 	     skb_prev = skb_check, skb_check = skb_check->next) {
1148 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1149 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1150 					     sizeof(_tcph_check));
1151 
1152 		/* only TCP packets with matching 5-tuple are eligible, and only
1153 		 * drop safe headers
1154 		 */
1155 		if (!tcph_check || iph->version != iph_check->version ||
1156 		    tcph_check->source != tcph->source ||
1157 		    tcph_check->dest != tcph->dest)
1158 			continue;
1159 
1160 		if (iph_check->version == 4) {
1161 			if (iph_check->saddr != iph->saddr ||
1162 			    iph_check->daddr != iph->daddr)
1163 				continue;
1164 
1165 			seglen = ntohs(iph_check->tot_len) -
1166 				       (4 * iph_check->ihl);
1167 		} else if (iph_check->version == 6) {
1168 			ipv6h = (struct ipv6hdr *)iph;
1169 			ipv6h_check = (struct ipv6hdr *)iph_check;
1170 
1171 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1172 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1173 				continue;
1174 
1175 			seglen = ntohs(ipv6h_check->payload_len);
1176 		} else {
1177 			WARN_ON(1);  /* shouldn't happen */
1178 			continue;
1179 		}
1180 
1181 		/* If the ECE/CWR flags changed from the previous eligible
1182 		 * packet in the same flow, we should no longer be dropping that
1183 		 * previous packet as this would lose information.
1184 		 */
1185 		if (elig_ack && (tcp_flag_word(tcph_check) &
1186 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1187 			elig_ack = NULL;
1188 			elig_ack_prev = NULL;
1189 			num_found--;
1190 		}
1191 
1192 		/* Check TCP options and flags, don't drop ACKs with segment
1193 		 * data, and don't drop ACKs with a higher cumulative ACK
1194 		 * counter than the triggering packet. Check ACK seqno here to
1195 		 * avoid parsing SACK options of packets we are going to exclude
1196 		 * anyway.
1197 		 */
1198 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1199 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1200 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1201 			continue;
1202 
1203 		/* Check SACK options. The triggering packet must SACK more data
1204 		 * than the ACK under consideration, or SACK the same range but
1205 		 * have a larger cumulative ACK counter. The latter is a
1206 		 * pathological case, but is contained in the following check
1207 		 * anyway, just to be safe.
1208 		 */
1209 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1210 
1211 		if (sack_comp < 0 ||
1212 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1213 		     sack_comp == 0))
1214 			continue;
1215 
1216 		/* At this point we have found an eligible pure ACK to drop; if
1217 		 * we are in aggressive mode, we are done. Otherwise, keep
1218 		 * searching unless this is the second eligible ACK we
1219 		 * found.
1220 		 *
1221 		 * Since we want to drop ACK closest to the head of the queue,
1222 		 * save the first eligible ACK we find, even if we need to loop
1223 		 * again.
1224 		 */
1225 		if (!elig_ack) {
1226 			elig_ack = skb_check;
1227 			elig_ack_prev = skb_prev;
1228 			elig_flags = (tcp_flag_word(tcph_check)
1229 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1230 		}
1231 
1232 		if (num_found++ > 0)
1233 			goto found;
1234 	}
1235 
1236 	/* We made it through the queue without finding two eligible ACKs. If
1237 	 * we found a single eligible ACK we can drop it in aggressive mode if
1238 	 * we can guarantee that this does not interfere with ECN flag
1239 	 * information. We ensure this by dropping it only if the enqueued
1240 	 * packet is consecutive with the eligible ACK, and their flags match.
1241 	 */
1242 	if (elig_ack && aggressive && elig_ack->next == skb &&
1243 	    (elig_flags == (tcp_flag_word(tcph) &
1244 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1245 		goto found;
1246 
1247 	return NULL;
1248 
1249 found:
1250 	if (elig_ack_prev)
1251 		elig_ack_prev->next = elig_ack->next;
1252 	else
1253 		flow->head = elig_ack->next;
1254 
1255 	skb_mark_not_on_list(elig_ack);
1256 
1257 	return elig_ack;
1258 }
1259 
1260 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1261 {
1262 	avg -= avg >> shift;
1263 	avg += sample >> shift;
1264 	return avg;
1265 }
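
/* e.g. shift = 8 gives avg' = avg - avg/256 + sample/256, a ~1/256-weight
 * exponentially weighted moving average; shift = 2 (used when a sample
 * exceeds the current average) weights the new sample at 1/4, so the
 * estimates below react faster upwards than downwards.
 */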
1266 
1267 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1268 {
1269 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1270 		len -= off;
1271 
1272 	if (q->max_netlen < len)
1273 		q->max_netlen = len;
1274 	if (q->min_netlen > len)
1275 		q->min_netlen = len;
1276 
1277 	len += q->rate_overhead;
1278 
1279 	if (len < q->rate_mpu)
1280 		len = q->rate_mpu;
1281 
1282 	if (q->atm_mode == CAKE_ATM_ATM) {
1283 		len += 47;
1284 		len /= 48;
1285 		len *= 53;
1286 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1287 		/* Add one byte per 64 bytes or part thereof.
1288 		 * This is conservative and easier to calculate than the
1289 		 * precise value.
1290 		 */
1291 		len += (len + 63) / 64;
1292 	}
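	/* Worked examples (for illustration): in ATM mode a 1500 byte frame
	 * needs (1500 + 47) / 48 = 32 cells and is charged 32 * 53 = 1696
	 * bytes on the wire.  In PTM mode the same frame is charged
	 * 1500 + (1500 + 63) / 64 = 1524 bytes, slightly above the exact
	 * 64b/65b figure of ~1523.4, hence "conservative" above.
	 */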
1293 
1294 	if (q->max_adjlen < len)
1295 		q->max_adjlen = len;
1296 	if (q->min_adjlen > len)
1297 		q->min_adjlen = len;
1298 
1299 	return len;
1300 }
1301 
1302 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1303 {
1304 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1305 	unsigned int hdr_len, last_len = 0;
1306 	u32 off = skb_network_offset(skb);
1307 	u32 len = qdisc_pkt_len(skb);
1308 	u16 segs = 1;
1309 
1310 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1311 
1312 	if (!shinfo->gso_size)
1313 		return cake_calc_overhead(q, len, off);
1314 
1315 	/* borrowed from qdisc_pkt_len_init() */
1316 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1317 
1318 	/* + transport layer */
1319 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1320 						SKB_GSO_TCPV6))) {
1321 		const struct tcphdr *th;
1322 		struct tcphdr _tcphdr;
1323 
1324 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1325 					sizeof(_tcphdr), &_tcphdr);
1326 		if (likely(th))
1327 			hdr_len += __tcp_hdrlen(th);
1328 	} else {
1329 		struct udphdr _udphdr;
1330 
1331 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1332 				       sizeof(_udphdr), &_udphdr))
1333 			hdr_len += sizeof(struct udphdr);
1334 	}
1335 
1336 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1337 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1338 				    shinfo->gso_size);
1339 	else
1340 		segs = shinfo->gso_segs;
1341 
1342 	len = shinfo->gso_size + hdr_len;
1343 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1344 
1345 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1346 		cake_calc_overhead(q, last_len, off));
1347 }
1348 
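/* The overflow heap below is a binary max-heap over every (tin, flow) pair,
 * keyed on backlog in bytes; cake_drop() pops the root to find the longest
 * queue to shorten whenever buffer_limit is exceeded.  overflow_idx[] stores
 * each flow's current position so the heap can be fixed up incrementally on
 * enqueue and dequeue.
 */
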
1349 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1350 {
1351 	struct cake_heap_entry ii = q->overflow_heap[i];
1352 	struct cake_heap_entry jj = q->overflow_heap[j];
1353 
1354 	q->overflow_heap[i] = jj;
1355 	q->overflow_heap[j] = ii;
1356 
1357 	q->tins[ii.t].overflow_idx[ii.b] = j;
1358 	q->tins[jj.t].overflow_idx[jj.b] = i;
1359 }
1360 
1361 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1362 {
1363 	struct cake_heap_entry ii = q->overflow_heap[i];
1364 
1365 	return q->tins[ii.t].backlogs[ii.b];
1366 }
1367 
1368 static void cake_heapify(struct cake_sched_data *q, u16 i)
1369 {
1370 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1371 	u32 mb = cake_heap_get_backlog(q, i);
1372 	u32 m = i;
1373 
1374 	while (m < a) {
1375 		u32 l = m + m + 1;
1376 		u32 r = l + 1;
1377 
1378 		if (l < a) {
1379 			u32 lb = cake_heap_get_backlog(q, l);
1380 
1381 			if (lb > mb) {
1382 				m  = l;
1383 				mb = lb;
1384 			}
1385 		}
1386 
1387 		if (r < a) {
1388 			u32 rb = cake_heap_get_backlog(q, r);
1389 
1390 			if (rb > mb) {
1391 				m  = r;
1392 				mb = rb;
1393 			}
1394 		}
1395 
1396 		if (m != i) {
1397 			cake_heap_swap(q, i, m);
1398 			i = m;
1399 		} else {
1400 			break;
1401 		}
1402 	}
1403 }
1404 
1405 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1406 {
1407 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1408 		u16 p = (i - 1) >> 1;
1409 		u32 ib = cake_heap_get_backlog(q, i);
1410 		u32 pb = cake_heap_get_backlog(q, p);
1411 
1412 		if (ib > pb) {
1413 			cake_heap_swap(q, i, p);
1414 			i = p;
1415 		} else {
1416 			break;
1417 		}
1418 	}
1419 }
1420 
1421 static int cake_advance_shaper(struct cake_sched_data *q,
1422 			       struct cake_tin_data *b,
1423 			       struct sk_buff *skb,
1424 			       ktime_t now, bool drop)
1425 {
1426 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1427 
1428 	/* charge packet bandwidth to this tin
1429 	 * and to the global shaper.
1430 	 */
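	/* For a sense of scale (illustrative, not from the original source):
	 * rate_ns/rate_shft encode nanoseconds-per-byte, so at 10 Mbit/s
	 * (~800 ns per byte) a 1500 byte adjusted length advances the tin and
	 * global clocks by ~1.2 ms, and the failsafe clock by 1.5x that.
	 */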
1431 	if (q->rate_ns) {
1432 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1433 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1434 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1435 
1436 		if (ktime_before(b->time_next_packet, now))
1437 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1438 							   tin_dur);
1439 
1440 		else if (ktime_before(b->time_next_packet,
1441 				      ktime_add_ns(now, tin_dur)))
1442 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1443 
1444 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1445 						   global_dur);
1446 		if (!drop)
1447 			q->failsafe_next_packet = \
1448 				ktime_add_ns(q->failsafe_next_packet,
1449 					     failsafe_dur);
1450 	}
1451 	return len;
1452 }
1453 
1454 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1455 {
1456 	struct cake_sched_data *q = qdisc_priv(sch);
1457 	ktime_t now = ktime_get();
1458 	u32 idx = 0, tin = 0, len;
1459 	struct cake_heap_entry qq;
1460 	struct cake_tin_data *b;
1461 	struct cake_flow *flow;
1462 	struct sk_buff *skb;
1463 
1464 	if (!q->overflow_timeout) {
1465 		int i;
1466 		/* Build fresh max-heap */
1467 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1468 			cake_heapify(q, i);
1469 	}
1470 	q->overflow_timeout = 65535;
1471 
1472 	/* select longest queue for pruning */
1473 	qq  = q->overflow_heap[0];
1474 	tin = qq.t;
1475 	idx = qq.b;
1476 
1477 	b = &q->tins[tin];
1478 	flow = &b->flows[idx];
1479 	skb = dequeue_head(flow);
1480 	if (unlikely(!skb)) {
1481 		/* heap has gone wrong, rebuild it next time */
1482 		q->overflow_timeout = 0;
1483 		return idx + (tin << 16);
1484 	}
1485 
1486 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1487 		b->unresponsive_flow_count++;
1488 
1489 	len = qdisc_pkt_len(skb);
1490 	q->buffer_used      -= skb->truesize;
1491 	b->backlogs[idx]    -= len;
1492 	b->tin_backlog      -= len;
1493 	sch->qstats.backlog -= len;
1494 	qdisc_tree_reduce_backlog(sch, 1, len);
1495 
1496 	flow->dropped++;
1497 	b->tin_dropped++;
1498 	sch->qstats.drops++;
1499 
1500 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1501 		cake_advance_shaper(q, b, skb, now, true);
1502 
1503 	__qdisc_drop(skb, to_free);
1504 	sch->q.qlen--;
1505 
1506 	cake_heapify(q, 0);
1507 
1508 	return idx + (tin << 16);
1509 }
1510 
1511 static void cake_wash_diffserv(struct sk_buff *skb)
1512 {
1513 	switch (skb->protocol) {
1514 	case htons(ETH_P_IP):
1515 		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1516 		break;
1517 	case htons(ETH_P_IPV6):
1518 		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1519 		break;
1520 	default:
1521 		break;
1522 	}
1523 }
1524 
1525 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1526 {
1527 	u8 dscp;
1528 
1529 	switch (skb->protocol) {
1530 	case htons(ETH_P_IP):
1531 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1532 		if (wash && dscp)
1533 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1534 		return dscp;
1535 
1536 	case htons(ETH_P_IPV6):
1537 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1538 		if (wash && dscp)
1539 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1540 		return dscp;
1541 
1542 	case htons(ETH_P_ARP):
1543 		return 0x38;  /* CS7 - Net Control */
1544 
1545 	default:
1546 		/* If there is no Diffserv field, treat as best-effort */
1547 		return 0;
1548 	}
1549 }
1550 
1551 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1552 					     struct sk_buff *skb)
1553 {
1554 	struct cake_sched_data *q = qdisc_priv(sch);
1555 	u32 tin;
1556 
1557 	if (TC_H_MAJ(skb->priority) == sch->handle &&
1558 	    TC_H_MIN(skb->priority) > 0 &&
1559 	    TC_H_MIN(skb->priority) <= q->tin_cnt) {
1560 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1561 
1562 		if (q->rate_flags & CAKE_FLAG_WASH)
1563 			cake_wash_diffserv(skb);
1564 	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
1565 		/* extract the Diffserv Precedence field, if it exists */
1566 		/* and clear DSCP bits if washing */
1567 		tin = q->tin_index[cake_handle_diffserv(skb,
1568 				q->rate_flags & CAKE_FLAG_WASH)];
1569 		if (unlikely(tin >= q->tin_cnt))
1570 			tin = 0;
1571 	} else {
1572 		tin = 0;
1573 		if (q->rate_flags & CAKE_FLAG_WASH)
1574 			cake_wash_diffserv(skb);
1575 	}
1576 
1577 	return &q->tins[tin];
1578 }
1579 
1580 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1581 			 struct sk_buff *skb, int flow_mode, int *qerr)
1582 {
1583 	struct cake_sched_data *q = qdisc_priv(sch);
1584 	struct tcf_proto *filter;
1585 	struct tcf_result res;
1586 	u16 flow = 0, host = 0;
1587 	int result;
1588 
1589 	filter = rcu_dereference_bh(q->filter_list);
1590 	if (!filter)
1591 		goto hash;
1592 
1593 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1594 	result = tcf_classify(skb, filter, &res, false);
1595 
1596 	if (result >= 0) {
1597 #ifdef CONFIG_NET_CLS_ACT
1598 		switch (result) {
1599 		case TC_ACT_STOLEN:
1600 		case TC_ACT_QUEUED:
1601 		case TC_ACT_TRAP:
1602 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1603 			/* fall through */
1604 		case TC_ACT_SHOT:
1605 			return 0;
1606 		}
1607 #endif
1608 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1609 			flow = TC_H_MIN(res.classid);
1610 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1611 			host = TC_H_MAJ(res.classid) >> 16;
1612 	}
1613 hash:
1614 	*t = cake_select_tin(sch, skb);
1615 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1616 }
1617 
1618 static void cake_reconfigure(struct Qdisc *sch);
1619 
1620 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1621 			struct sk_buff **to_free)
1622 {
1623 	struct cake_sched_data *q = qdisc_priv(sch);
1624 	int len = qdisc_pkt_len(skb);
1625 	int uninitialized_var(ret);
1626 	struct sk_buff *ack = NULL;
1627 	ktime_t now = ktime_get();
1628 	struct cake_tin_data *b;
1629 	struct cake_flow *flow;
1630 	u32 idx;
1631 
1632 	/* choose flow to insert into */
1633 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1634 	if (idx == 0) {
1635 		if (ret & __NET_XMIT_BYPASS)
1636 			qdisc_qstats_drop(sch);
1637 		__qdisc_drop(skb, to_free);
1638 		return ret;
1639 	}
1640 	idx--;
1641 	flow = &b->flows[idx];
1642 
1643 	/* ensure shaper state isn't stale */
1644 	if (!b->tin_backlog) {
1645 		if (ktime_before(b->time_next_packet, now))
1646 			b->time_next_packet = now;
1647 
1648 		if (!sch->q.qlen) {
1649 			if (ktime_before(q->time_next_packet, now)) {
1650 				q->failsafe_next_packet = now;
1651 				q->time_next_packet = now;
1652 			} else if (ktime_after(q->time_next_packet, now) &&
1653 				   ktime_after(q->failsafe_next_packet, now)) {
1654 				u64 next = \
1655 					min(ktime_to_ns(q->time_next_packet),
1656 					    ktime_to_ns(
1657 						   q->failsafe_next_packet));
1658 				sch->qstats.overlimits++;
1659 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1660 			}
1661 		}
1662 	}
1663 
1664 	if (unlikely(len > b->max_skblen))
1665 		b->max_skblen = len;
1666 
1667 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1668 		struct sk_buff *segs, *nskb;
1669 		netdev_features_t features = netif_skb_features(skb);
1670 		unsigned int slen = 0, numsegs = 0;
1671 
1672 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1673 		if (IS_ERR_OR_NULL(segs))
1674 			return qdisc_drop(skb, sch, to_free);
1675 
1676 		while (segs) {
1677 			nskb = segs->next;
1678 			skb_mark_not_on_list(segs);
1679 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1680 			cobalt_set_enqueue_time(segs, now);
1681 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1682 									  segs);
1683 			flow_queue_add(flow, segs);
1684 
1685 			sch->q.qlen++;
1686 			numsegs++;
1687 			slen += segs->len;
1688 			q->buffer_used += segs->truesize;
1689 			b->packets++;
1690 			segs = nskb;
1691 		}
1692 
1693 		/* stats */
1694 		b->bytes	    += slen;
1695 		b->backlogs[idx]    += slen;
1696 		b->tin_backlog      += slen;
1697 		sch->qstats.backlog += slen;
1698 		q->avg_window_bytes += slen;
1699 
1700 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1701 		consume_skb(skb);
1702 	} else {
1703 		/* not splitting */
1704 		cobalt_set_enqueue_time(skb, now);
1705 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1706 		flow_queue_add(flow, skb);
1707 
1708 		if (q->ack_filter)
1709 			ack = cake_ack_filter(q, flow);
1710 
1711 		if (ack) {
1712 			b->ack_drops++;
1713 			sch->qstats.drops++;
1714 			b->bytes += qdisc_pkt_len(ack);
1715 			len -= qdisc_pkt_len(ack);
1716 			q->buffer_used += skb->truesize - ack->truesize;
1717 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1718 				cake_advance_shaper(q, b, ack, now, true);
1719 
1720 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1721 			consume_skb(ack);
1722 		} else {
1723 			sch->q.qlen++;
1724 			q->buffer_used      += skb->truesize;
1725 		}
1726 
1727 		/* stats */
1728 		b->packets++;
1729 		b->bytes	    += len;
1730 		b->backlogs[idx]    += len;
1731 		b->tin_backlog      += len;
1732 		sch->qstats.backlog += len;
1733 		q->avg_window_bytes += len;
1734 	}
1735 
1736 	if (q->overflow_timeout)
1737 		cake_heapify_up(q, b->overflow_idx[idx]);
1738 
1739 	/* incoming bandwidth capacity estimate */
1740 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1741 		u64 packet_interval = \
1742 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1743 
1744 		if (packet_interval > NSEC_PER_SEC)
1745 			packet_interval = NSEC_PER_SEC;
1746 
1747 		/* filter out short-term bursts, eg. wifi aggregation */
1748 		q->avg_packet_interval = \
1749 			cake_ewma(q->avg_packet_interval,
1750 				  packet_interval,
1751 				  (packet_interval > q->avg_packet_interval ?
1752 					  2 : 8));
1753 
1754 		q->last_packet_time = now;
1755 
1756 		if (packet_interval > q->avg_packet_interval) {
1757 			u64 window_interval = \
1758 				ktime_to_ns(ktime_sub(now,
1759 						      q->avg_window_begin));
1760 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1761 
1762 			do_div(b, window_interval);
1763 			q->avg_peak_bandwidth =
1764 				cake_ewma(q->avg_peak_bandwidth, b,
1765 					  b > q->avg_peak_bandwidth ? 2 : 8);
1766 			q->avg_window_bytes = 0;
1767 			q->avg_window_begin = now;
1768 
1769 			if (ktime_after(now,
1770 					ktime_add_ms(q->last_reconfig_time,
1771 						     250))) {
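				/* shape to 15/16 (~94%) of the measured
				 * peak; the check above rate-limits
				 * reconfiguration to once per 250 ms.
				 */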
1772 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1773 				cake_reconfigure(sch);
1774 			}
1775 		}
1776 	} else {
1777 		q->avg_window_bytes = 0;
1778 		q->last_packet_time = now;
1779 	}
1780 
1781 	/* flowchain */
1782 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1783 		struct cake_host *srchost = &b->hosts[flow->srchost];
1784 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1785 		u16 host_load = 1;
1786 
1787 		if (!flow->set) {
1788 			list_add_tail(&flow->flowchain, &b->new_flows);
1789 		} else {
1790 			b->decaying_flow_count--;
1791 			list_move_tail(&flow->flowchain, &b->new_flows);
1792 		}
1793 		flow->set = CAKE_SET_SPARSE;
1794 		b->sparse_flow_count++;
1795 
1796 		if (cake_dsrc(q->flow_mode))
1797 			host_load = max(host_load, srchost->srchost_refcnt);
1798 
1799 		if (cake_ddst(q->flow_mode))
1800 			host_load = max(host_load, dsthost->dsthost_refcnt);
1801 
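		/* quantum_div[] is a reciprocal table (~65535/i, filled in at
		 * init, not shown here), so this is roughly flow_quantum /
		 * host_load: a new flow's first deficit is shared between the
		 * flows already active on its busiest endpoint.
		 */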
1802 		flow->deficit = (b->flow_quantum *
1803 				 quantum_div[host_load]) >> 16;
1804 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1805 		/* this flow was empty, accounted as a sparse flow, but actually
1806 		 * in the bulk rotation.
1807 		 */
1808 		flow->set = CAKE_SET_BULK;
1809 		b->sparse_flow_count--;
1810 		b->bulk_flow_count++;
1811 	}
1812 
1813 	if (q->buffer_used > q->buffer_max_used)
1814 		q->buffer_max_used = q->buffer_used;
1815 
1816 	if (q->buffer_used > q->buffer_limit) {
1817 		u32 dropped = 0;
1818 
1819 		while (q->buffer_used > q->buffer_limit) {
1820 			dropped++;
1821 			cake_drop(sch, to_free);
1822 		}
1823 		b->drop_overlimit += dropped;
1824 	}
1825 	return NET_XMIT_SUCCESS;
1826 }
1827 
1828 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1829 {
1830 	struct cake_sched_data *q = qdisc_priv(sch);
1831 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1832 	struct cake_flow *flow = &b->flows[q->cur_flow];
1833 	struct sk_buff *skb = NULL;
1834 	u32 len;
1835 
1836 	if (flow->head) {
1837 		skb = dequeue_head(flow);
1838 		len = qdisc_pkt_len(skb);
1839 		b->backlogs[q->cur_flow] -= len;
1840 		b->tin_backlog		 -= len;
1841 		sch->qstats.backlog      -= len;
1842 		q->buffer_used		 -= skb->truesize;
1843 		sch->q.qlen--;
1844 
1845 		if (q->overflow_timeout)
1846 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1847 	}
1848 	return skb;
1849 }
1850 
1851 /* Discard leftover packets from a tin no longer in use. */
1852 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1853 {
1854 	struct cake_sched_data *q = qdisc_priv(sch);
1855 	struct sk_buff *skb;
1856 
1857 	q->cur_tin = tin;
1858 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1859 		while (!!(skb = cake_dequeue_one(sch)))
1860 			kfree_skb(skb);
1861 }
1862 
1863 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1864 {
1865 	struct cake_sched_data *q = qdisc_priv(sch);
1866 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1867 	struct cake_host *srchost, *dsthost;
1868 	ktime_t now = ktime_get();
1869 	struct cake_flow *flow;
1870 	struct list_head *head;
1871 	bool first_flow = true;
1872 	struct sk_buff *skb;
1873 	u16 host_load;
1874 	u64 delay;
1875 	u32 len;
1876 
1877 begin:
1878 	if (!sch->q.qlen)
1879 		return NULL;
1880 
1881 	/* global hard shaper */
1882 	if (ktime_after(q->time_next_packet, now) &&
1883 	    ktime_after(q->failsafe_next_packet, now)) {
1884 		u64 next = min(ktime_to_ns(q->time_next_packet),
1885 			       ktime_to_ns(q->failsafe_next_packet));
1886 
1887 		sch->qstats.overlimits++;
1888 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1889 		return NULL;
1890 	}
1891 
1892 	/* Choose a class to work on. */
1893 	if (!q->rate_ns) {
1894 		/* In unlimited mode, can't rely on shaper timings, just balance
1895 		 * with DRR
1896 		 */
1897 		bool wrapped = false, empty = true;
1898 
1899 		while (b->tin_deficit < 0 ||
1900 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1901 			if (b->tin_deficit <= 0)
1902 				b->tin_deficit += b->tin_quantum_band;
1903 			if (b->sparse_flow_count + b->bulk_flow_count)
1904 				empty = false;
1905 
1906 			q->cur_tin++;
1907 			b++;
1908 			if (q->cur_tin >= q->tin_cnt) {
1909 				q->cur_tin = 0;
1910 				b = q->tins;
1911 
1912 				if (wrapped) {
1913 					/* It's possible for q->qlen to be
1914 					 * nonzero when we actually have no
1915 					 * packets anywhere.
1916 					 */
1917 					if (empty)
1918 						return NULL;
1919 				} else {
1920 					wrapped = true;
1921 				}
1922 			}
1923 		}
1924 	} else {
1925 		/* In shaped mode, choose:
1926 		 * - Highest-priority tin with queue and meeting schedule, or
1927 		 * - The earliest-scheduled tin with queue.
1928 		 */
1929 		ktime_t best_time = KTIME_MAX;
1930 		int tin, best_tin = 0;
1931 
1932 		for (tin = 0; tin < q->tin_cnt; tin++) {
1933 			b = q->tins + tin;
1934 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1935 				ktime_t time_to_pkt =
1936 					ktime_sub(b->time_next_packet, now);
1937 
1938 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1939 				    ktime_compare(time_to_pkt,
1940 						  best_time) <= 0) {
1941 					best_time = time_to_pkt;
1942 					best_tin = tin;
1943 				}
1944 			}
1945 		}
1946 
1947 		q->cur_tin = best_tin;
1948 		b = q->tins + best_tin;
1949 
1950 		/* No point in going further if no packets to deliver. */
1951 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1952 			return NULL;
1953 	}
1954 
1955 retry:
1956 	/* service this class */
1957 	head = &b->decaying_flows;
1958 	if (!first_flow || list_empty(head)) {
1959 		head = &b->new_flows;
1960 		if (list_empty(head)) {
1961 			head = &b->old_flows;
1962 			if (unlikely(list_empty(head))) {
1963 				head = &b->decaying_flows;
1964 				if (unlikely(list_empty(head)))
1965 					goto begin;
1966 			}
1967 		}
1968 	}
1969 	flow = list_first_entry(head, struct cake_flow, flowchain);
1970 	q->cur_flow = flow - b->flows;
1971 	first_flow = false;
1972 
1973 	/* triple isolation (modified DRR++) */
1974 	srchost = &b->hosts[flow->srchost];
1975 	dsthost = &b->hosts[flow->dsthost];
1976 	host_load = 1;
1977 
1978 	if (cake_dsrc(q->flow_mode))
1979 		host_load = max(host_load, srchost->srchost_refcnt);
1980 
1981 	if (cake_ddst(q->flow_mode))
1982 		host_load = max(host_load, dsthost->dsthost_refcnt);
1983 
1984 	WARN_ON(host_load > CAKE_QUEUES);
1985 
1986 	/* flow isolation (DRR++) */
1987 	if (flow->deficit <= 0) {
1988 		/* The shifted prandom_u32() is a way to apply dithering to
1989 		 * avoid accumulating roundoff errors
1990 		 */
1991 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1992 				  (prandom_u32() >> 16)) >> 16;
1993 		list_move_tail(&flow->flowchain, &b->old_flows);
1994 
1995 		/* Keep all flows with deficits out of the sparse and decaying
1996 		 * rotations.  No non-empty flow can go into the decaying
1997 		 * rotation, so they can't get deficits
1998 		 */
1999 		if (flow->set == CAKE_SET_SPARSE) {
2000 			if (flow->head) {
2001 				b->sparse_flow_count--;
2002 				b->bulk_flow_count++;
2003 				flow->set = CAKE_SET_BULK;
2004 			} else {
2005 				/* we've moved it to the bulk rotation for
2006 				 * correct deficit accounting but we still want
2007 				 * to count it as a sparse flow, not a bulk one.
2008 				 */
2009 				flow->set = CAKE_SET_SPARSE_WAIT;
2010 			}
2011 		}
2012 		goto retry;
2013 	}
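
	/* Rough arithmetic sketch of the refill above: quantum_div[h] is
	 * initialised to 65535 / h (see cake_init()), so the increment is
	 * approximately flow_quantum / host_load.  E.g. with the default
	 * flow_quantum of 1514 and host_load == 2, each refill adds roughly
	 * 757 bytes of deficit, plus at most one byte of random dither.
	 */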
2014 
2015 	/* Retrieve a packet via the AQM */
2016 	while (1) {
2017 		skb = cake_dequeue_one(sch);
2018 		if (!skb) {
2019 			/* this queue was actually empty */
2020 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2021 				b->unresponsive_flow_count--;
2022 
2023 			if (flow->cvars.p_drop || flow->cvars.count ||
2024 			    ktime_before(now, flow->cvars.drop_next)) {
2025 				/* keep in the flowchain until the state has
2026 				 * decayed to rest
2027 				 */
2028 				list_move_tail(&flow->flowchain,
2029 					       &b->decaying_flows);
2030 				if (flow->set == CAKE_SET_BULK) {
2031 					b->bulk_flow_count--;
2032 					b->decaying_flow_count++;
2033 				} else if (flow->set == CAKE_SET_SPARSE ||
2034 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2035 					b->sparse_flow_count--;
2036 					b->decaying_flow_count++;
2037 				}
2038 				flow->set = CAKE_SET_DECAYING;
2039 			} else {
2040 				/* remove empty queue from the flowchain */
2041 				list_del_init(&flow->flowchain);
2042 				if (flow->set == CAKE_SET_SPARSE ||
2043 				    flow->set == CAKE_SET_SPARSE_WAIT)
2044 					b->sparse_flow_count--;
2045 				else if (flow->set == CAKE_SET_BULK)
2046 					b->bulk_flow_count--;
2047 				else
2048 					b->decaying_flow_count--;
2049 
2050 				flow->set = CAKE_SET_NONE;
2051 				srchost->srchost_refcnt--;
2052 				dsthost->dsthost_refcnt--;
2053 			}
2054 			goto begin;
2055 		}
2056 
2057 		/* Last packet in queue may be marked, shouldn't be dropped */
2058 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2059 					(b->bulk_flow_count *
2060 					 !!(q->rate_flags &
2061 					    CAKE_FLAG_INGRESS))) ||
2062 		    !flow->head)
2063 			break;
2064 
2065 		/* drop this packet, get another one */
2066 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2067 			len = cake_advance_shaper(q, b, skb,
2068 						  now, true);
2069 			flow->deficit -= len;
2070 			b->tin_deficit -= len;
2071 		}
2072 		flow->dropped++;
2073 		b->tin_dropped++;
2074 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2075 		qdisc_qstats_drop(sch);
2076 		kfree_skb(skb);
2077 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2078 			goto retry;
2079 	}
2080 
2081 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2082 	qdisc_bstats_update(sch, skb);
2083 
2084 	/* collect delay stats */
2085 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2086 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2087 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2088 				  delay > b->peak_delay ? 2 : 8);
2089 	b->base_delay = cake_ewma(b->base_delay, delay,
2090 				  delay < b->base_delay ? 2 : 8);
2091 
2092 	len = cake_advance_shaper(q, b, skb, now, false);
2093 	flow->deficit -= len;
2094 	b->tin_deficit -= len;
2095 
2096 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2097 		u64 next = min(ktime_to_ns(q->time_next_packet),
2098 			       ktime_to_ns(q->failsafe_next_packet));
2099 
2100 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2101 	} else if (!sch->q.qlen) {
2102 		int i;
2103 
2104 		for (i = 0; i < q->tin_cnt; i++) {
2105 			if (q->tins[i].decaying_flow_count) {
2106 				ktime_t next =
2107 					ktime_add_ns(now,
2108 						     q->tins[i].cparams.target);
2109 
2110 				qdisc_watchdog_schedule_ns(&q->watchdog,
2111 							   ktime_to_ns(next));
2112 				break;
2113 			}
2114 		}
2115 	}
2116 
2117 	if (q->overflow_timeout)
2118 		q->overflow_timeout--;
2119 
2120 	return skb;
2121 }
2122 
2123 static void cake_reset(struct Qdisc *sch)
2124 {
2125 	u32 c;
2126 
2127 	for (c = 0; c < CAKE_MAX_TINS; c++)
2128 		cake_clear_tin(sch, c);
2129 }
2130 
2131 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2132 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2133 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2134 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2135 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2136 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2137 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2138 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2139 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2140 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2141 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2142 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2143 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2144 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2145 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2146 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2147 };
2148 
2149 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2150 			  u64 target_ns, u64 rtt_est_ns)
2151 {
2152 	/* convert byte-rate into time-per-byte
2153 	 * so it will always unwedge in reasonable time.
2154 	 */
2155 	static const u64 MIN_RATE = 64;
2156 	u32 byte_target = mtu;
2157 	u64 byte_target_ns;
2158 	u8  rate_shft = 0;
2159 	u64 rate_ns = 0;
2160 
2161 	b->flow_quantum = 1514;
2162 	if (rate) {
2163 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2164 		rate_shft = 34;
2165 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2166 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2167 		while (!!(rate_ns >> 34)) {
2168 			rate_ns >>= 1;
2169 			rate_shft--;
2170 		}
2171 	} /* else unlimited, ie. zero delay */
2172 
2173 	b->tin_rate_bps  = rate;
2174 	b->tin_rate_ns   = rate_ns;
2175 	b->tin_rate_shft = rate_shft;
2176 
2177 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2178 
2179 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2180 	b->cparams.interval = max(rtt_est_ns +
2181 				     b->cparams.target - target_ns,
2182 				     b->cparams.target * 2);
2183 	b->cparams.mtu_time = byte_target_ns;
2184 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2185 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2186 }
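
/* A rough worked example for cake_set_rate(), assuming rate is expressed in
 * bytes per second as the shaper arithmetic implies: for rate = 12,500,000 B/s
 * (~100 Mbit/s), rate_ns starts as (NSEC_PER_SEC << 34) / rate and is halved
 * (with rate_shft reduced in step) until it fits in 34 bits, which works out
 * to 80 ns per byte, so a 1514-byte packet charges ~121 us to the shaper.
 * flow_quantum = clamp(rate >> 12, 300, 1514) stays at 1514 here and only
 * drops towards 300 on very slow links.  The Codel target then becomes the
 * larger of the configured target and 1.5x the MTU serialisation time.
 */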
2187 
2188 static int cake_config_besteffort(struct Qdisc *sch)
2189 {
2190 	struct cake_sched_data *q = qdisc_priv(sch);
2191 	struct cake_tin_data *b = &q->tins[0];
2192 	u32 mtu = psched_mtu(qdisc_dev(sch));
2193 	u64 rate = q->rate_bps;
2194 
2195 	q->tin_cnt = 1;
2196 
2197 	q->tin_index = besteffort;
2198 	q->tin_order = normal_order;
2199 
2200 	cake_set_rate(b, rate, mtu,
2201 		      us_to_ns(q->target), us_to_ns(q->interval));
2202 	b->tin_quantum_band = 65535;
2203 	b->tin_quantum_prio = 65535;
2204 
2205 	return 0;
2206 }
2207 
2208 static int cake_config_precedence(struct Qdisc *sch)
2209 {
2210 	/* convert high-level (user visible) parameters into internal format */
2211 	struct cake_sched_data *q = qdisc_priv(sch);
2212 	u32 mtu = psched_mtu(qdisc_dev(sch));
2213 	u64 rate = q->rate_bps;
2214 	u32 quantum1 = 256;
2215 	u32 quantum2 = 256;
2216 	u32 i;
2217 
2218 	q->tin_cnt = 8;
2219 	q->tin_index = precedence;
2220 	q->tin_order = normal_order;
2221 
2222 	for (i = 0; i < q->tin_cnt; i++) {
2223 		struct cake_tin_data *b = &q->tins[i];
2224 
2225 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2226 			      us_to_ns(q->interval));
2227 
2228 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2229 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2230 
2231 		/* calculate next class's parameters */
2232 		rate  *= 7;
2233 		rate >>= 3;
2234 
2235 		quantum1  *= 3;
2236 		quantum1 >>= 1;
2237 
2238 		quantum2  *= 7;
2239 		quantum2 >>= 3;
2240 	}
2241 
2242 	return 0;
2243 }
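
/* The per-tin progression above is geometric: each successive precedence tin
 * gets a threshold rate of 7/8 of the previous one, a priority weight
 * (quantum1) growing by 3/2 and a bandwidth-share weight (quantum2) shrinking
 * by 7/8.  Starting from 256 this gives priority weights of roughly
 * 256, 384, 576, 864, ... and bandwidth weights of 256, 224, 196, 171, ...
 * The same scheme is reused by cake_config_diffserv8() below.
 */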
2244 
2245 /*	List of known Diffserv codepoints:
2246  *
2247  *	Least Effort (CS1)
2248  *	Best Effort (CS0)
2249  *	Max Reliability & LLT "Lo" (TOS1)
2250  *	Max Throughput (TOS2)
2251  *	Min Delay (TOS4)
2252  *	LLT "La" (TOS5)
2253  *	Assured Forwarding 1 (AF1x) - x3
2254  *	Assured Forwarding 2 (AF2x) - x3
2255  *	Assured Forwarding 3 (AF3x) - x3
2256  *	Assured Forwarding 4 (AF4x) - x3
2257  *	Precedence Class 2 (CS2)
2258  *	Precedence Class 3 (CS3)
2259  *	Precedence Class 4 (CS4)
2260  *	Precedence Class 5 (CS5)
2261  *	Precedence Class 6 (CS6)
2262  *	Precedence Class 7 (CS7)
2263  *	Voice Admit (VA)
2264  *	Expedited Forwarding (EF)
2265 
2266  *
2267  */
2268 
2269 /*	List of traffic classes in RFC 4594:
2270  *		(roughly descending order of contended priority)
2271  *		(roughly ascending order of uncontended throughput)
2272  *
2273  *	Network Control (CS6,CS7)      - routing traffic
2274  *	Telephony (EF,VA)         - aka. VoIP streams
2275  *	Signalling (CS5)               - VoIP setup
2276  *	Multimedia Conferencing (AF4x) - aka. video calls
2277  *	Realtime Interactive (CS4)     - eg. games
2278  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2279  *	Broadcast Video (CS3)
2280  *	Low Latency Data (AF2x,TOS4)      - eg. database
2281  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2282  *	Standard Service (CS0 & unrecognised codepoints)
2283  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2284  *	Low Priority Data (CS1)           - eg. BitTorrent
2285  *
2286  *	Total 12 traffic classes.
2287  */
2288 
2289 static int cake_config_diffserv8(struct Qdisc *sch)
2290 {
2291 /*	Pruned list of traffic classes for typical applications:
2292  *
2293  *		Network Control          (CS6, CS7)
2294  *		Minimum Latency          (EF, VA, CS5, CS4)
2295  *		Interactive Shell        (CS2, TOS1)
2296  *		Low Latency Transactions (AF2x, TOS4)
2297  *		Video Streaming          (AF4x, AF3x, CS3)
2298  *		Bog Standard             (CS0 etc.)
2299  *		High Throughput          (AF1x, TOS2)
2300  *		Background Traffic       (CS1)
2301  *
2302  *		Total 8 traffic classes.
2303  */
2304 
2305 	struct cake_sched_data *q = qdisc_priv(sch);
2306 	u32 mtu = psched_mtu(qdisc_dev(sch));
2307 	u64 rate = q->rate_bps;
2308 	u32 quantum1 = 256;
2309 	u32 quantum2 = 256;
2310 	u32 i;
2311 
2312 	q->tin_cnt = 8;
2313 
2314 	/* codepoint to class mapping */
2315 	q->tin_index = diffserv8;
2316 	q->tin_order = normal_order;
2317 
2318 	/* class characteristics */
2319 	for (i = 0; i < q->tin_cnt; i++) {
2320 		struct cake_tin_data *b = &q->tins[i];
2321 
2322 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2323 			      us_to_ns(q->interval));
2324 
2325 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2326 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2327 
2328 		/* calculate next class's parameters */
2329 		rate  *= 7;
2330 		rate >>= 3;
2331 
2332 		quantum1  *= 3;
2333 		quantum1 >>= 1;
2334 
2335 		quantum2  *= 7;
2336 		quantum2 >>= 3;
2337 	}
2338 
2339 	return 0;
2340 }
2341 
2342 static int cake_config_diffserv4(struct Qdisc *sch)
2343 {
2344 /*  Further pruned list of traffic classes for four-class system:
2345  *
2346  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2347  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2348  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2349  *	    Background Traffic (CS1)
2350  *
2351  *		Total 4 traffic classes.
2352  */
2353 
2354 	struct cake_sched_data *q = qdisc_priv(sch);
2355 	u32 mtu = psched_mtu(qdisc_dev(sch));
2356 	u64 rate = q->rate_bps;
2357 	u32 quantum = 1024;
2358 
2359 	q->tin_cnt = 4;
2360 
2361 	/* codepoint to class mapping */
2362 	q->tin_index = diffserv4;
2363 	q->tin_order = bulk_order;
2364 
2365 	/* class characteristics */
2366 	cake_set_rate(&q->tins[0], rate, mtu,
2367 		      us_to_ns(q->target), us_to_ns(q->interval));
2368 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2369 		      us_to_ns(q->target), us_to_ns(q->interval));
2370 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2371 		      us_to_ns(q->target), us_to_ns(q->interval));
2372 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2373 		      us_to_ns(q->target), us_to_ns(q->interval));
2374 
2375 	/* priority weights */
2376 	q->tins[0].tin_quantum_prio = quantum;
2377 	q->tins[1].tin_quantum_prio = quantum >> 4;
2378 	q->tins[2].tin_quantum_prio = quantum << 2;
2379 	q->tins[3].tin_quantum_prio = quantum << 4;
2380 
2381 	/* bandwidth-sharing weights */
2382 	q->tins[0].tin_quantum_band = quantum;
2383 	q->tins[1].tin_quantum_band = quantum >> 4;
2384 	q->tins[2].tin_quantum_band = quantum >> 1;
2385 	q->tins[3].tin_quantum_band = quantum >> 2;
2386 
2387 	return 0;
2388 }
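
/* Reading the rates and weights above against the four-class list (per the
 * diffserv4[] codepoint mapping): tin 0 carries Best Effort at the full base
 * rate, tin 1 Background at 1/16 (6.25%), tin 2 Streaming Media at 1/2 and
 * tin 3 Latency Sensitive at 1/4 of it.  The priority weights rise in the
 * opposite direction, from quantum >> 4 for Background up to quantum << 4 for
 * Latency Sensitive, so the latency-sensitive tin wins contention while its
 * long-term bandwidth share stays capped.
 */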
2389 
2390 static int cake_config_diffserv3(struct Qdisc *sch)
2391 {
2392 /*  Simplified Diffserv structure with 3 tins.
2393  *		Low Priority		(CS1)
2394  *		Best Effort
2395  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2396  */
2397 	struct cake_sched_data *q = qdisc_priv(sch);
2398 	u32 mtu = psched_mtu(qdisc_dev(sch));
2399 	u64 rate = q->rate_bps;
2400 	u32 quantum = 1024;
2401 
2402 	q->tin_cnt = 3;
2403 
2404 	/* codepoint to class mapping */
2405 	q->tin_index = diffserv3;
2406 	q->tin_order = bulk_order;
2407 
2408 	/* class characteristics */
2409 	cake_set_rate(&q->tins[0], rate, mtu,
2410 		      us_to_ns(q->target), us_to_ns(q->interval));
2411 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2412 		      us_to_ns(q->target), us_to_ns(q->interval));
2413 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2414 		      us_to_ns(q->target), us_to_ns(q->interval));
2415 
2416 	/* priority weights */
2417 	q->tins[0].tin_quantum_prio = quantum;
2418 	q->tins[1].tin_quantum_prio = quantum >> 4;
2419 	q->tins[2].tin_quantum_prio = quantum << 4;
2420 
2421 	/* bandwidth-sharing weights */
2422 	q->tins[0].tin_quantum_band = quantum;
2423 	q->tins[1].tin_quantum_band = quantum >> 4;
2424 	q->tins[2].tin_quantum_band = quantum >> 2;
2425 
2426 	return 0;
2427 }
2428 
2429 static void cake_reconfigure(struct Qdisc *sch)
2430 {
2431 	struct cake_sched_data *q = qdisc_priv(sch);
2432 	int c, ft;
2433 
2434 	switch (q->tin_mode) {
2435 	case CAKE_DIFFSERV_BESTEFFORT:
2436 		ft = cake_config_besteffort(sch);
2437 		break;
2438 
2439 	case CAKE_DIFFSERV_PRECEDENCE:
2440 		ft = cake_config_precedence(sch);
2441 		break;
2442 
2443 	case CAKE_DIFFSERV_DIFFSERV8:
2444 		ft = cake_config_diffserv8(sch);
2445 		break;
2446 
2447 	case CAKE_DIFFSERV_DIFFSERV4:
2448 		ft = cake_config_diffserv4(sch);
2449 		break;
2450 
2451 	case CAKE_DIFFSERV_DIFFSERV3:
2452 	default:
2453 		ft = cake_config_diffserv3(sch);
2454 		break;
2455 	}
2456 
2457 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2458 		cake_clear_tin(sch, c);
2459 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2460 	}
2461 
2462 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2463 	q->rate_shft = q->tins[ft].tin_rate_shft;
2464 
2465 	if (q->buffer_config_limit) {
2466 		q->buffer_limit = q->buffer_config_limit;
2467 	} else if (q->rate_bps) {
2468 		u64 t = q->rate_bps * q->interval;
2469 
2470 		do_div(t, USEC_PER_SEC / 4);
2471 		q->buffer_limit = max_t(u32, t, 4U << 20);
2472 	} else {
2473 		q->buffer_limit = ~0;
2474 	}
2475 
2476 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2477 
2478 	q->buffer_limit = min(q->buffer_limit,
2479 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2480 				  q->buffer_config_limit));
2481 }
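
/* Buffer sizing sketch: with an explicit memlimit the configured value wins;
 * otherwise t = rate_bps * interval / (USEC_PER_SEC / 4) works out to four
 * times the bandwidth-delay product implied by the shaped rate and the rtt
 * estimate (interval is in microseconds), floored at 4 MiB.  For example,
 * 12,500,000 B/s at the default 100 ms interval gives a 5 MB buffer limit.
 * The result is finally clamped against sch->limit packets' worth of MTUs
 * (or the configured memlimit, if larger).
 */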
2482 
2483 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2484 		       struct netlink_ext_ack *extack)
2485 {
2486 	struct cake_sched_data *q = qdisc_priv(sch);
2487 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2488 	int err;
2489 
2490 	if (!opt)
2491 		return -EINVAL;
2492 
2493 	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2494 	if (err < 0)
2495 		return err;
2496 
2497 	if (tb[TCA_CAKE_NAT]) {
2498 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2499 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2500 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2501 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2502 #else
2503 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2504 				    "No conntrack support in kernel");
2505 		return -EOPNOTSUPP;
2506 #endif
2507 	}
2508 
2509 	if (tb[TCA_CAKE_BASE_RATE64])
2510 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2511 
2512 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2513 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2514 
2515 	if (tb[TCA_CAKE_WASH]) {
2516 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2517 			q->rate_flags |= CAKE_FLAG_WASH;
2518 		else
2519 			q->rate_flags &= ~CAKE_FLAG_WASH;
2520 	}
2521 
2522 	if (tb[TCA_CAKE_FLOW_MODE])
2523 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2524 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2525 					CAKE_FLOW_MASK));
2526 
2527 	if (tb[TCA_CAKE_ATM])
2528 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2529 
2530 	if (tb[TCA_CAKE_OVERHEAD]) {
2531 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2532 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2533 
2534 		q->max_netlen = 0;
2535 		q->max_adjlen = 0;
2536 		q->min_netlen = ~0;
2537 		q->min_adjlen = ~0;
2538 	}
2539 
2540 	if (tb[TCA_CAKE_RAW]) {
2541 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2542 
2543 		q->max_netlen = 0;
2544 		q->max_adjlen = 0;
2545 		q->min_netlen = ~0;
2546 		q->min_adjlen = ~0;
2547 	}
2548 
2549 	if (tb[TCA_CAKE_MPU])
2550 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2551 
2552 	if (tb[TCA_CAKE_RTT]) {
2553 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2554 
2555 		if (!q->interval)
2556 			q->interval = 1;
2557 	}
2558 
2559 	if (tb[TCA_CAKE_TARGET]) {
2560 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2561 
2562 		if (!q->target)
2563 			q->target = 1;
2564 	}
2565 
2566 	if (tb[TCA_CAKE_AUTORATE]) {
2567 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2568 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2569 		else
2570 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2571 	}
2572 
2573 	if (tb[TCA_CAKE_INGRESS]) {
2574 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2575 			q->rate_flags |= CAKE_FLAG_INGRESS;
2576 		else
2577 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2578 	}
2579 
2580 	if (tb[TCA_CAKE_ACK_FILTER])
2581 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2582 
2583 	if (tb[TCA_CAKE_MEMORY])
2584 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2585 
2586 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2587 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2588 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2589 		else
2590 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2591 	}
2592 
2593 	if (q->tins) {
2594 		sch_tree_lock(sch);
2595 		cake_reconfigure(sch);
2596 		sch_tree_unlock(sch);
2597 	}
2598 
2599 	return 0;
2600 }
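
/* For reference, the netlink attributes above correspond to the tc-cake
 * keywords; a typical (illustrative) configuration from userspace might be:
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 100Mbit diffserv4 nat
 *
 * which sets TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE and TCA_CAKE_NAT
 * respectively.  RTT and TARGET are taken in microseconds, matching the
 * 100000/5000 defaults set in cake_init() below.
 */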
2601 
2602 static void cake_destroy(struct Qdisc *sch)
2603 {
2604 	struct cake_sched_data *q = qdisc_priv(sch);
2605 
2606 	qdisc_watchdog_cancel(&q->watchdog);
2607 	tcf_block_put(q->block);
2608 	kvfree(q->tins);
2609 }
2610 
2611 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2612 		     struct netlink_ext_ack *extack)
2613 {
2614 	struct cake_sched_data *q = qdisc_priv(sch);
2615 	int i, j, err;
2616 
2617 	sch->limit = 10240;
2618 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2619 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2620 
2621 	q->rate_bps = 0; /* unlimited by default */
2622 
2623 	q->interval = 100000; /* 100ms default */
2624 	q->target   =   5000; /* 5ms: codel RFC argues
2625 			       * for 5 to 10% of interval
2626 			       */
2627 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2628 	q->cur_tin = 0;
2629 	q->cur_flow  = 0;
2630 
2631 	qdisc_watchdog_init(&q->watchdog, sch);
2632 
2633 	if (opt) {
2634 		int err = cake_change(sch, opt, extack);
2635 
2636 		if (err)
2637 			return err;
2638 	}
2639 
2640 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2641 	if (err)
2642 		return err;
2643 
2644 	quantum_div[0] = ~0;
2645 	for (i = 1; i <= CAKE_QUEUES; i++)
2646 		quantum_div[i] = 65535 / i;
2647 
2648 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2649 			   GFP_KERNEL);
2650 	if (!q->tins)
2651 		goto nomem;
2652 
2653 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2654 		struct cake_tin_data *b = q->tins + i;
2655 
2656 		INIT_LIST_HEAD(&b->new_flows);
2657 		INIT_LIST_HEAD(&b->old_flows);
2658 		INIT_LIST_HEAD(&b->decaying_flows);
2659 		b->sparse_flow_count = 0;
2660 		b->bulk_flow_count = 0;
2661 		b->decaying_flow_count = 0;
2662 
2663 		for (j = 0; j < CAKE_QUEUES; j++) {
2664 			struct cake_flow *flow = b->flows + j;
2665 			u32 k = j * CAKE_MAX_TINS + i;
2666 
2667 			INIT_LIST_HEAD(&flow->flowchain);
2668 			cobalt_vars_init(&flow->cvars);
2669 
2670 			q->overflow_heap[k].t = i;
2671 			q->overflow_heap[k].b = j;
2672 			b->overflow_idx[j] = k;
2673 		}
2674 	}
2675 
2676 	cake_reconfigure(sch);
2677 	q->avg_peak_bandwidth = q->rate_bps;
2678 	q->min_netlen = ~0;
2679 	q->min_adjlen = ~0;
2680 	return 0;
2681 
2682 nomem:
2683 	cake_destroy(sch);
2684 	return -ENOMEM;
2685 }
2686 
2687 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2688 {
2689 	struct cake_sched_data *q = qdisc_priv(sch);
2690 	struct nlattr *opts;
2691 
2692 	opts = nla_nest_start(skb, TCA_OPTIONS);
2693 	if (!opts)
2694 		goto nla_put_failure;
2695 
2696 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2697 			      TCA_CAKE_PAD))
2698 		goto nla_put_failure;
2699 
2700 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2701 			q->flow_mode & CAKE_FLOW_MASK))
2702 		goto nla_put_failure;
2703 
2704 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2705 		goto nla_put_failure;
2706 
2707 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2708 		goto nla_put_failure;
2709 
2710 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2711 		goto nla_put_failure;
2712 
2713 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2714 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2715 		goto nla_put_failure;
2716 
2717 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2718 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2719 		goto nla_put_failure;
2720 
2721 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2722 		goto nla_put_failure;
2723 
2724 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2725 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2726 		goto nla_put_failure;
2727 
2728 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2729 		goto nla_put_failure;
2730 
2731 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2732 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2733 		goto nla_put_failure;
2734 
2735 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2736 		goto nla_put_failure;
2737 
2738 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2739 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2740 			goto nla_put_failure;
2741 
2742 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2743 		goto nla_put_failure;
2744 
2745 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2746 		goto nla_put_failure;
2747 
2748 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2749 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2750 		goto nla_put_failure;
2751 
2752 	return nla_nest_end(skb, opts);
2753 
2754 nla_put_failure:
2755 	return -1;
2756 }
2757 
2758 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2759 {
2760 	struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2761 	struct cake_sched_data *q = qdisc_priv(sch);
2762 	struct nlattr *tstats, *ts;
2763 	int i;
2764 
2765 	if (!stats)
2766 		return -1;
2767 
2768 #define PUT_STAT_U32(attr, data) do {				       \
2769 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2770 			goto nla_put_failure;			       \
2771 	} while (0)
2772 #define PUT_STAT_U64(attr, data) do {				       \
2773 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2774 					data, TCA_CAKE_STATS_PAD)) \
2775 			goto nla_put_failure;			       \
2776 	} while (0)
2777 
2778 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2779 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2780 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2781 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2782 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2783 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2784 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2785 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2786 
2787 #undef PUT_STAT_U32
2788 #undef PUT_STAT_U64
2789 
2790 	tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2791 	if (!tstats)
2792 		goto nla_put_failure;
2793 
2794 #define PUT_TSTAT_U32(attr, data) do {					\
2795 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2796 			goto nla_put_failure;				\
2797 	} while (0)
2798 #define PUT_TSTAT_U64(attr, data) do {					\
2799 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2800 					data, TCA_CAKE_TIN_STATS_PAD))	\
2801 			goto nla_put_failure;				\
2802 	} while (0)
2803 
2804 	for (i = 0; i < q->tin_cnt; i++) {
2805 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2806 
2807 		ts = nla_nest_start(d->skb, i + 1);
2808 		if (!ts)
2809 			goto nla_put_failure;
2810 
2811 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2812 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2813 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2814 
2815 		PUT_TSTAT_U32(TARGET_US,
2816 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2817 		PUT_TSTAT_U32(INTERVAL_US,
2818 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2819 
2820 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2821 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2822 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2823 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2824 
2825 		PUT_TSTAT_U32(PEAK_DELAY_US,
2826 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2827 		PUT_TSTAT_U32(AVG_DELAY_US,
2828 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2829 		PUT_TSTAT_U32(BASE_DELAY_US,
2830 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2831 
2832 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2833 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2834 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2835 
2836 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2837 					    b->decaying_flow_count);
2838 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2839 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2840 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2841 
2842 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2843 		nla_nest_end(d->skb, ts);
2844 	}
2845 
2846 #undef PUT_TSTAT_U32
2847 #undef PUT_TSTAT_U64
2848 
2849 	nla_nest_end(d->skb, tstats);
2850 	return nla_nest_end(d->skb, stats);
2851 
2852 nla_put_failure:
2853 	nla_nest_cancel(d->skb, stats);
2854 	return -1;
2855 }
2856 
2857 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2858 {
2859 	return NULL;
2860 }
2861 
2862 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2863 {
2864 	return 0;
2865 }
2866 
2867 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2868 			       u32 classid)
2869 {
2870 	return 0;
2871 }
2872 
2873 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2874 {
2875 }
2876 
2877 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2878 					struct netlink_ext_ack *extack)
2879 {
2880 	struct cake_sched_data *q = qdisc_priv(sch);
2881 
2882 	if (cl)
2883 		return NULL;
2884 	return q->block;
2885 }
2886 
2887 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2888 			   struct sk_buff *skb, struct tcmsg *tcm)
2889 {
2890 	tcm->tcm_handle |= TC_H_MIN(cl);
2891 	return 0;
2892 }
2893 
2894 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2895 				 struct gnet_dump *d)
2896 {
2897 	struct cake_sched_data *q = qdisc_priv(sch);
2898 	const struct cake_flow *flow = NULL;
2899 	struct gnet_stats_queue qs = { 0 };
2900 	struct nlattr *stats;
2901 	u32 idx = cl - 1;
2902 
2903 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2904 		const struct cake_tin_data *b =
2905 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2906 		const struct sk_buff *skb;
2907 
2908 		flow = &b->flows[idx % CAKE_QUEUES];
2909 
2910 		if (flow->head) {
2911 			sch_tree_lock(sch);
2912 			skb = flow->head;
2913 			while (skb) {
2914 				qs.qlen++;
2915 				skb = skb->next;
2916 			}
2917 			sch_tree_unlock(sch);
2918 		}
2919 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2920 		qs.drops = flow->dropped;
2921 	}
2922 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2923 		return -1;
2924 	if (flow) {
2925 		ktime_t now = ktime_get();
2926 
2927 		stats = nla_nest_start(d->skb, TCA_STATS_APP);
2928 		if (!stats)
2929 			return -1;
2930 
2931 #define PUT_STAT_U32(attr, data) do {				       \
2932 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2933 			goto nla_put_failure;			       \
2934 	} while (0)
2935 #define PUT_STAT_S32(attr, data) do {				       \
2936 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2937 			goto nla_put_failure;			       \
2938 	} while (0)
2939 
2940 		PUT_STAT_S32(DEFICIT, flow->deficit);
2941 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2942 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2943 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2944 		if (flow->cvars.p_drop) {
2945 			PUT_STAT_S32(BLUE_TIMER_US,
2946 				     ktime_to_us(
2947 					     ktime_sub(now,
2948 						     flow->cvars.blue_timer)));
2949 		}
2950 		if (flow->cvars.dropping) {
2951 			PUT_STAT_S32(DROP_NEXT_US,
2952 				     ktime_to_us(
2953 					     ktime_sub(now,
2954 						       flow->cvars.drop_next)));
2955 		}
2956 
2957 		if (nla_nest_end(d->skb, stats) < 0)
2958 			return -1;
2959 	}
2960 
2961 	return 0;
2962 
2963 nla_put_failure:
2964 	nla_nest_cancel(d->skb, stats);
2965 	return -1;
2966 }
2967 
2968 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2969 {
2970 	struct cake_sched_data *q = qdisc_priv(sch);
2971 	unsigned int i, j;
2972 
2973 	if (arg->stop)
2974 		return;
2975 
2976 	for (i = 0; i < q->tin_cnt; i++) {
2977 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2978 
2979 		for (j = 0; j < CAKE_QUEUES; j++) {
2980 			if (list_empty(&b->flows[j].flowchain) ||
2981 			    arg->count < arg->skip) {
2982 				arg->count++;
2983 				continue;
2984 			}
2985 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
2986 				arg->stop = 1;
2987 				break;
2988 			}
2989 			arg->count++;
2990 		}
2991 	}
2992 }
2993 
2994 static const struct Qdisc_class_ops cake_class_ops = {
2995 	.leaf		=	cake_leaf,
2996 	.find		=	cake_find,
2997 	.tcf_block	=	cake_tcf_block,
2998 	.bind_tcf	=	cake_bind,
2999 	.unbind_tcf	=	cake_unbind,
3000 	.dump		=	cake_dump_class,
3001 	.dump_stats	=	cake_dump_class_stats,
3002 	.walk		=	cake_walk,
3003 };
3004 
3005 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3006 	.cl_ops		=	&cake_class_ops,
3007 	.id		=	"cake",
3008 	.priv_size	=	sizeof(struct cake_sched_data),
3009 	.enqueue	=	cake_enqueue,
3010 	.dequeue	=	cake_dequeue,
3011 	.peek		=	qdisc_peek_dequeued,
3012 	.init		=	cake_init,
3013 	.reset		=	cake_reset,
3014 	.destroy	=	cake_destroy,
3015 	.change		=	cake_change,
3016 	.dump		=	cake_dump,
3017 	.dump_stats	=	cake_dump_stats,
3018 	.owner		=	THIS_MODULE,
3019 };
3020 
3021 static int __init cake_module_init(void)
3022 {
3023 	return register_qdisc(&cake_qdisc_ops);
3024 }
3025 
3026 static void __exit cake_module_exit(void)
3027 {
3028 	unregister_qdisc(&cake_qdisc_ops);
3029 }
3030 
3031 module_init(cake_module_init)
3032 module_exit(cake_module_exit)
3033 MODULE_AUTHOR("Jonathan Morton");
3034 MODULE_LICENSE("Dual BSD/GPL");
3035 MODULE_DESCRIPTION("The CAKE shaper.");
3036