xref: /openbmc/linux/net/sched/sch_cake.c (revision a17922de)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (e.g. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
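
/* A typical configuration, for illustration only (option names as documented
 * in tc-cake(8); they may vary between iproute2 versions):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * This shapes egress to 20 Mbit/s, classifies traffic into four Diffserv
 * tins and enables NAT-aware host isolation, leaving everything else at its
 * defaults.
 */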
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/version.h>
68 #include <linux/if_vlan.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 #define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */
84 
85 /* struct cobalt_params - contains codel and blue parameters
86  * @interval:	codel initial drop spacing
87  * @target:     maximum persistent sojourn time & blue update rate
88  * @mtu_time:   serialisation delay of maximum-size packet
89  * @p_inc:      increment of blue drop probability (0.32 fxp)
90  * @p_dec:      decrement of blue drop probability (0.32 fxp)
91  */
92 struct cobalt_params {
93 	u64	interval;
94 	u64	target;
95 	u64	mtu_time;
96 	u32	p_inc;
97 	u32	p_dec;
98 };
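
/* Note on units: p_inc and p_dec are 0.32 fixed point, i.e. the drop
 * probability they encode is the stored value divided by 2^32.  For example,
 * a value of 1 << 24 corresponds to a probability step of 2^-8 = 1/256 per
 * BLUE update, and 1 << 20 to a step of 1/4096.
 */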
99 
100 /* struct cobalt_vars - contains codel and blue variables
101  * @count:		codel dropping frequency
102  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
103  * @drop_next:		time to drop next packet, or when we dropped last
104  * @blue_timer:		Blue time to next drop
105  * @p_drop:		BLUE drop probability (0.32 fxp)
106  * @dropping:		set if in dropping state
107  * @ecn_marked:		set if marked
108  */
109 struct cobalt_vars {
110 	u32	count;
111 	u32	rec_inv_sqrt;
112 	ktime_t	drop_next;
113 	ktime_t	blue_timer;
114 	u32     p_drop;
115 	bool	dropping;
116 	bool    ecn_marked;
117 };
118 
119 enum {
120 	CAKE_SET_NONE = 0,
121 	CAKE_SET_SPARSE,
122 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
123 	CAKE_SET_BULK,
124 	CAKE_SET_DECAYING
125 };
126 
127 struct cake_flow {
128 	/* this stuff is all needed per-flow at dequeue time */
129 	struct sk_buff	  *head;
130 	struct sk_buff	  *tail;
131 	struct list_head  flowchain;
132 	s32		  deficit;
133 	u32		  dropped;
134 	struct cobalt_vars cvars;
135 	u16		  srchost; /* index into cake_host table */
136 	u16		  dsthost;
137 	u8		  set;
138 }; /* please try to keep this structure <= 64 bytes */
139 
140 struct cake_host {
141 	u32 srchost_tag;
142 	u32 dsthost_tag;
143 	u16 srchost_refcnt;
144 	u16 dsthost_refcnt;
145 };
146 
147 struct cake_heap_entry {
148 	u16 t:3, b:10;
149 };
150 
151 struct cake_tin_data {
152 	struct cake_flow flows[CAKE_QUEUES];
153 	u32	backlogs[CAKE_QUEUES];
154 	u32	tags[CAKE_QUEUES]; /* for set association */
155 	u16	overflow_idx[CAKE_QUEUES];
156 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
157 	u16	flow_quantum;
158 
159 	struct cobalt_params cparams;
160 	u32	drop_overlimit;
161 	u16	bulk_flow_count;
162 	u16	sparse_flow_count;
163 	u16	decaying_flow_count;
164 	u16	unresponsive_flow_count;
165 
166 	u32	max_skblen;
167 
168 	struct list_head new_flows;
169 	struct list_head old_flows;
170 	struct list_head decaying_flows;
171 
172 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
173 	ktime_t	time_next_packet;
174 	u64	tin_rate_ns;
175 	u64	tin_rate_bps;
176 	u16	tin_rate_shft;
177 
178 	u16	tin_quantum_prio;
179 	u16	tin_quantum_band;
180 	s32	tin_deficit;
181 	u32	tin_backlog;
182 	u32	tin_dropped;
183 	u32	tin_ecn_mark;
184 
185 	u32	packets;
186 	u64	bytes;
187 
188 	u32	ack_drops;
189 
190 	/* moving averages */
191 	u64 avge_delay;
192 	u64 peak_delay;
193 	u64 base_delay;
194 
195 	/* hash function stats */
196 	u32	way_directs;
197 	u32	way_hits;
198 	u32	way_misses;
199 	u32	way_collisions;
200 }; /* number of tins is small, so size of this struct doesn't matter much */
201 
202 struct cake_sched_data {
203 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
204 	struct tcf_block *block;
205 	struct cake_tin_data *tins;
206 
207 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
208 	u16		overflow_timeout;
209 
210 	u16		tin_cnt;
211 	u8		tin_mode;
212 	u8		flow_mode;
213 	u8		ack_filter;
214 	u8		atm_mode;
215 
216 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217 	u16		rate_shft;
218 	ktime_t		time_next_packet;
219 	ktime_t		failsafe_next_packet;
220 	u64		rate_ns;
221 	u64		rate_bps;
222 	u16		rate_flags;
223 	s16		rate_overhead;
224 	u16		rate_mpu;
225 	u64		interval;
226 	u64		target;
227 
228 	/* resource tracking */
229 	u32		buffer_used;
230 	u32		buffer_max_used;
231 	u32		buffer_limit;
232 	u32		buffer_config_limit;
233 
234 	/* indices for dequeue */
235 	u16		cur_tin;
236 	u16		cur_flow;
237 
238 	struct qdisc_watchdog watchdog;
239 	const u8	*tin_index;
240 	const u8	*tin_order;
241 
242 	/* bandwidth capacity estimate */
243 	ktime_t		last_packet_time;
244 	ktime_t		avg_window_begin;
245 	u64		avg_packet_interval;
246 	u64		avg_window_bytes;
247 	u64		avg_peak_bandwidth;
248 	ktime_t		last_reconfig_time;
249 
250 	/* packet length stats */
251 	u32		avg_netoff;
252 	u16		max_netlen;
253 	u16		max_adjlen;
254 	u16		min_netlen;
255 	u16		min_adjlen;
256 };
257 
258 enum {
259 	CAKE_FLAG_OVERHEAD	   = BIT(0),
260 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261 	CAKE_FLAG_INGRESS	   = BIT(2),
262 	CAKE_FLAG_WASH		   = BIT(3),
263 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
264 };
265 
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271 
272 struct cobalt_skb_cb {
273 	ktime_t enqueue_time;
274 	u32     adjusted_len;
275 };
276 
277 static u64 us_to_ns(u64 us)
278 {
279 	return us * NSEC_PER_USEC;
280 }
281 
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287 
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290 	return get_cobalt_cb(skb)->enqueue_time;
291 }
292 
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294 				    ktime_t now)
295 {
296 	get_cobalt_cb(skb)->enqueue_time = now;
297 }
298 
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300 
301 /* Diffserv lookup tables */
302 
303 static const u8 precedence[] = {
304 	0, 0, 0, 0, 0, 0, 0, 0,
305 	1, 1, 1, 1, 1, 1, 1, 1,
306 	2, 2, 2, 2, 2, 2, 2, 2,
307 	3, 3, 3, 3, 3, 3, 3, 3,
308 	4, 4, 4, 4, 4, 4, 4, 4,
309 	5, 5, 5, 5, 5, 5, 5, 5,
310 	6, 6, 6, 6, 6, 6, 6, 6,
311 	7, 7, 7, 7, 7, 7, 7, 7,
312 };
313 
314 static const u8 diffserv8[] = {
315 	2, 5, 1, 2, 4, 2, 2, 2,
316 	0, 2, 1, 2, 1, 2, 1, 2,
317 	5, 2, 4, 2, 4, 2, 4, 2,
318 	3, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 2, 2, 6, 2, 6, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 };
324 
325 static const u8 diffserv4[] = {
326 	0, 2, 0, 0, 2, 0, 0, 0,
327 	1, 0, 0, 0, 0, 0, 0, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 0, 0, 3, 0, 3, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 };
335 
336 static const u8 diffserv3[] = {
337 	0, 0, 0, 0, 2, 0, 0, 0,
338 	1, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 2, 0, 2, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 };
346 
347 static const u8 besteffort[] = {
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 };
357 
358 /* tin priority order for stats dumping */
359 
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
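
/* Reading the tables above: the index is the DSCP value (0..63) and the
 * entry is the tin number.  With diffserv4, for example, the default DSCP 0
 * lands in tin 0 (best effort), CS1 (DSCP 8) in tin 1 (bulk), AF41 (DSCP 34)
 * in tin 2 (video) and EF (DSCP 46) in tin 3 (voice).  bulk_order[] then
 * presents those tins in priority order (bulk first) when dumping stats.
 */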
362 
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365 
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
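
/* Worked example in plain arithmetic (the code below does the same in Q0.32):
 * for count = 4 the true value is 1/sqrt(4) = 0.5.  Starting from a stale
 * invsqrt of 0.6, one step gives (0.6 / 2) * (3 - 4 * 0.36) = 0.468; a
 * second step gives about 0.497, converging quickly on 0.5.
 */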
371 
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374 	u32 invsqrt, invsqrt2;
375 	u64 val;
376 
377 	invsqrt = vars->rec_inv_sqrt;
378 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380 
381 	val >>= 2; /* avoid overflow in following multiply */
382 	val = (val * invsqrt) >> (32 - 2 + 1);
383 
384 	vars->rec_inv_sqrt = val;
385 }
386 
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389 	if (vars->count < REC_INV_SQRT_CACHE)
390 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391 	else
392 		cobalt_newton_step(vars);
393 }
394 
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is large enough to
402  * give the value that *should* have been produced at count 4.
403  */
404 
405 static void cobalt_cache_init(void)
406 {
407 	struct cobalt_vars v;
408 
409 	memset(&v, 0, sizeof(v));
410 	v.rec_inv_sqrt = ~0U;
411 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412 
413 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 
419 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420 	}
421 }
422 
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425 	memset(vars, 0, sizeof(*vars));
426 
427 	if (!cobalt_rec_inv_sqrt_cache[0]) {
428 		cobalt_cache_init();
429 		cobalt_rec_inv_sqrt_cache[0] = ~0;
430 	}
431 }
432 
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
437 static ktime_t cobalt_control(ktime_t t,
438 			      u64 interval,
439 			      u32 rec_inv_sqrt)
440 {
441 	return ktime_add_ns(t, reciprocal_scale(interval,
442 						rec_inv_sqrt));
443 }
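
/* For a feel of the numbers: reciprocal_scale(interval, rec_inv_sqrt)
 * computes (interval * rec_inv_sqrt) >> 32, so with interval = 100 ms and
 * count = 4 (rec_inv_sqrt ~= 0.5 in Q0.32) the next signalling event lands
 * about 50 ms after t, i.e. interval / sqrt(count).
 */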
444 
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449 			      struct cobalt_params *p,
450 			      ktime_t now)
451 {
452 	bool up = false;
453 
454 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455 		up = !vars->p_drop;
456 		vars->p_drop += p->p_inc;
457 		if (vars->p_drop < p->p_inc)
458 			vars->p_drop = ~0;
459 		vars->blue_timer = now;
460 	}
461 	vars->dropping = true;
462 	vars->drop_next = now;
463 	if (!vars->count)
464 		vars->count = 1;
465 
466 	return up;
467 }
468 
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473 			       struct cobalt_params *p,
474 			       ktime_t now)
475 {
476 	bool down = false;
477 
478 	if (vars->p_drop &&
479 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480 		if (vars->p_drop < p->p_dec)
481 			vars->p_drop = 0;
482 		else
483 			vars->p_drop -= p->p_dec;
484 		vars->blue_timer = now;
485 		down = !vars->p_drop;
486 	}
487 	vars->dropping = false;
488 
489 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490 		vars->count--;
491 		cobalt_invsqrt(vars);
492 		vars->drop_next = cobalt_control(vars->drop_next,
493 						 p->interval,
494 						 vars->rec_inv_sqrt);
495 	}
496 
497 	return down;
498 }
499 
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504 			       struct cobalt_params *p,
505 			       ktime_t now,
506 			       struct sk_buff *skb,
507 			       u32 bulk_flows)
508 {
509 	bool next_due, over_target, drop = false;
510 	ktime_t schedule;
511 	u64 sojourn;
512 
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527 
528 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529 	schedule = ktime_sub(now, vars->drop_next);
530 	over_target = sojourn > p->target &&
531 		      sojourn > p->mtu_time * bulk_flows * 2 &&
532 		      sojourn > p->mtu_time * 4;
533 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
534 
535 	vars->ecn_marked = false;
536 
537 	if (over_target) {
538 		if (!vars->dropping) {
539 			vars->dropping = true;
540 			vars->drop_next = cobalt_control(now,
541 							 p->interval,
542 							 vars->rec_inv_sqrt);
543 		}
544 		if (!vars->count)
545 			vars->count = 1;
546 	} else if (vars->dropping) {
547 		vars->dropping = false;
548 	}
549 
550 	if (next_due && vars->dropping) {
551 		/* Use ECN mark if possible, otherwise drop */
552 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553 
554 		vars->count++;
555 		if (!vars->count)
556 			vars->count--;
557 		cobalt_invsqrt(vars);
558 		vars->drop_next = cobalt_control(vars->drop_next,
559 						 p->interval,
560 						 vars->rec_inv_sqrt);
561 		schedule = ktime_sub(now, vars->drop_next);
562 	} else {
563 		while (next_due) {
564 			vars->count--;
565 			cobalt_invsqrt(vars);
566 			vars->drop_next = cobalt_control(vars->drop_next,
567 							 p->interval,
568 							 vars->rec_inv_sqrt);
569 			schedule = ktime_sub(now, vars->drop_next);
570 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
571 		}
572 	}
573 
574 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
575 	if (vars->p_drop)
576 		drop |= (prandom_u32() < vars->p_drop);
577 
578 	/* Overload the drop_next field as an activity timeout */
579 	if (!vars->count)
580 		vars->drop_next = ktime_add_ns(now, p->interval);
581 	else if (ktime_to_ns(schedule) > 0 && !drop)
582 		vars->drop_next = now;
583 
584 	return drop;
585 }
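
/* A rough feel for the over_target test above, assuming the default 5 ms
 * target: at 10 Mbit/s a 1518-byte MTU serialises in about 1.2 ms, so with
 * eight bulk flows a packet is only counted as over target once its sojourn
 * time exceeds max(5 ms, 2 * 8 * 1.2 ms, 4 * 1.2 ms) ~= 19 ms.  This stops
 * Cobalt signalling merely because the queue holds other flows' packets
 * waiting for their fair share of a slow link.
 */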
586 
587 static void cake_update_flowkeys(struct flow_keys *keys,
588 				 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 	struct nf_conntrack_tuple tuple = {};
592 	bool rev = !skb->_nfct;
593 
594 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
595 		return;
596 
597 	if (!nf_ct_get_tuple_skb(&tuple, skb))
598 		return;
599 
600 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
601 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
602 
603 	if (keys->ports.ports) {
604 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
605 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
606 	}
607 #endif
608 }
609 
610 /* Cake has several subtle multi-bit flow_mode settings.  The dual-host checks
611  * below therefore also match triple-isolate mode, which sets both sets of bits.
612  */
613 
614 static bool cake_dsrc(int flow_mode)
615 {
616 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
617 }
618 
619 static bool cake_ddst(int flow_mode)
620 {
621 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
622 }
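
/* Concretely (values as defined by the CAKE_FLOW_* enum in the pkt_sched.h
 * uapi header): CAKE_FLOW_DUAL_SRC is FLOWS|SRC_IP, CAKE_FLOW_DUAL_DST is
 * FLOWS|DST_IP, and triple-isolate mode sets both groups of bits.  So the
 * tests above are true in the corresponding dual mode *and* in triple mode,
 * which is the behaviour the comment above describes.
 */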
623 
624 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
625 		     int flow_mode)
626 {
627 	u32 flow_hash = 0, srchost_hash, dsthost_hash;
628 	u16 reduced_hash, srchost_idx, dsthost_idx;
629 	struct flow_keys keys, host_keys;
630 
631 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
632 		return 0;
633 
634 	skb_flow_dissect_flow_keys(skb, &keys,
635 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
636 
637 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
638 		cake_update_flowkeys(&keys, skb);
639 
640 	/* flow_hash_from_keys() sorts the addresses by value, so we have
641 	 * to preserve their order in a separate data structure to treat
642 	 * src and dst host addresses as independently selectable.
643 	 */
644 	host_keys = keys;
645 	host_keys.ports.ports     = 0;
646 	host_keys.basic.ip_proto  = 0;
647 	host_keys.keyid.keyid     = 0;
648 	host_keys.tags.flow_label = 0;
649 
650 	switch (host_keys.control.addr_type) {
651 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
652 		host_keys.addrs.v4addrs.src = 0;
653 		dsthost_hash = flow_hash_from_keys(&host_keys);
654 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
655 		host_keys.addrs.v4addrs.dst = 0;
656 		srchost_hash = flow_hash_from_keys(&host_keys);
657 		break;
658 
659 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
660 		memset(&host_keys.addrs.v6addrs.src, 0,
661 		       sizeof(host_keys.addrs.v6addrs.src));
662 		dsthost_hash = flow_hash_from_keys(&host_keys);
663 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
664 		memset(&host_keys.addrs.v6addrs.dst, 0,
665 		       sizeof(host_keys.addrs.v6addrs.dst));
666 		srchost_hash = flow_hash_from_keys(&host_keys);
667 		break;
668 
669 	default:
670 		dsthost_hash = 0;
671 		srchost_hash = 0;
672 	}
673 
674 	/* This *must* be after the above switch, since as a
675 	 * side-effect it sorts the src and dst addresses.
676 	 */
677 	if (flow_mode & CAKE_FLOW_FLOWS)
678 		flow_hash = flow_hash_from_keys(&keys);
679 
680 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
681 		if (flow_mode & CAKE_FLOW_SRC_IP)
682 			flow_hash ^= srchost_hash;
683 
684 		if (flow_mode & CAKE_FLOW_DST_IP)
685 			flow_hash ^= dsthost_hash;
686 	}
687 
688 	reduced_hash = flow_hash % CAKE_QUEUES;
689 
690 	/* set-associative hashing */
691 	/* fast path if no hash collision (direct lookup succeeds) */
692 	if (likely(q->tags[reduced_hash] == flow_hash &&
693 		   q->flows[reduced_hash].set)) {
694 		q->way_directs++;
695 	} else {
696 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
697 		u32 outer_hash = reduced_hash - inner_hash;
698 		bool allocate_src = false;
699 		bool allocate_dst = false;
700 		u32 i, k;
701 
702 		/* check if any active queue in the set is reserved for
703 		 * this flow.
704 		 */
705 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
706 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
707 			if (q->tags[outer_hash + k] == flow_hash) {
708 				if (i)
709 					q->way_hits++;
710 
711 				if (!q->flows[outer_hash + k].set) {
712 					/* need to increment host refcnts */
713 					allocate_src = cake_dsrc(flow_mode);
714 					allocate_dst = cake_ddst(flow_mode);
715 				}
716 
717 				goto found;
718 			}
719 		}
720 
721 		/* no queue is reserved for this flow, look for an
722 		 * empty one.
723 		 */
724 		for (i = 0; i < CAKE_SET_WAYS;
725 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
726 			if (!q->flows[outer_hash + k].set) {
727 				q->way_misses++;
728 				allocate_src = cake_dsrc(flow_mode);
729 				allocate_dst = cake_ddst(flow_mode);
730 				goto found;
731 			}
732 		}
733 
734 		/* With no empty queues, default to the original
735 		 * queue, accept the collision, update the host tags.
736 		 */
737 		q->way_collisions++;
738 		q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
739 		q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
740 		allocate_src = cake_dsrc(flow_mode);
741 		allocate_dst = cake_ddst(flow_mode);
742 found:
743 		/* reserve queue for future packets in same flow */
744 		reduced_hash = outer_hash + k;
745 		q->tags[reduced_hash] = flow_hash;
746 
747 		if (allocate_src) {
748 			srchost_idx = srchost_hash % CAKE_QUEUES;
749 			inner_hash = srchost_idx % CAKE_SET_WAYS;
750 			outer_hash = srchost_idx - inner_hash;
751 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
752 				i++, k = (k + 1) % CAKE_SET_WAYS) {
753 				if (q->hosts[outer_hash + k].srchost_tag ==
754 				    srchost_hash)
755 					goto found_src;
756 			}
757 			for (i = 0; i < CAKE_SET_WAYS;
758 				i++, k = (k + 1) % CAKE_SET_WAYS) {
759 				if (!q->hosts[outer_hash + k].srchost_refcnt)
760 					break;
761 			}
762 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
763 found_src:
764 			srchost_idx = outer_hash + k;
765 			q->hosts[srchost_idx].srchost_refcnt++;
766 			q->flows[reduced_hash].srchost = srchost_idx;
767 		}
768 
769 		if (allocate_dst) {
770 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
771 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
772 			outer_hash = dsthost_idx - inner_hash;
773 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
774 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
775 				if (q->hosts[outer_hash + k].dsthost_tag ==
776 				    dsthost_hash)
777 					goto found_dst;
778 			}
779 			for (i = 0; i < CAKE_SET_WAYS;
780 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
781 				if (!q->hosts[outer_hash + k].dsthost_refcnt)
782 					break;
783 			}
784 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
785 found_dst:
786 			dsthost_idx = outer_hash + k;
787 			q->hosts[dsthost_idx].dsthost_refcnt++;
788 			q->flows[reduced_hash].dsthost = dsthost_idx;
789 		}
790 	}
791 
792 	return reduced_hash;
793 }
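
/* Example of the set-associative lookup above: with CAKE_QUEUES = 1024 and
 * CAKE_SET_WAYS = 8, a flow whose reduced_hash is 37 belongs to the set
 * covering queues 32..39 (inner_hash = 5, outer_hash = 32).  The search
 * starts at queue 37 and wraps through 38, 39, 32, ..., 36, first looking
 * for a queue already tagged with this flow_hash, then for an empty one,
 * and only when both fail does it accept the collision.
 */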
794 
795 /* helper functions: might be changed when/if skbs use a standard list_head */
796 /* remove one skb from head of slot queue */
797 
798 static struct sk_buff *dequeue_head(struct cake_flow *flow)
799 {
800 	struct sk_buff *skb = flow->head;
801 
802 	if (skb) {
803 		flow->head = skb->next;
804 		skb->next = NULL;
805 	}
806 
807 	return skb;
808 }
809 
810 /* add skb to flow queue (tail add) */
811 
812 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
813 {
814 	if (!flow->head)
815 		flow->head = skb;
816 	else
817 		flow->tail->next = skb;
818 	flow->tail = skb;
819 	skb->next = NULL;
820 }
821 
822 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
823 				    struct ipv6hdr *buf)
824 {
825 	unsigned int offset = skb_network_offset(skb);
826 	struct iphdr *iph;
827 
828 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
829 
830 	if (!iph)
831 		return NULL;
832 
833 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
834 		return skb_header_pointer(skb, offset + iph->ihl * 4,
835 					  sizeof(struct ipv6hdr), buf);
836 
837 	else if (iph->version == 4)
838 		return iph;
839 
840 	else if (iph->version == 6)
841 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
842 					  buf);
843 
844 	return NULL;
845 }
846 
847 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
848 				      void *buf, unsigned int bufsize)
849 {
850 	unsigned int offset = skb_network_offset(skb);
851 	const struct ipv6hdr *ipv6h;
852 	const struct tcphdr *tcph;
853 	const struct iphdr *iph;
854 	struct ipv6hdr _ipv6h;
855 	struct tcphdr _tcph;
856 
857 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
858 
859 	if (!ipv6h)
860 		return NULL;
861 
862 	if (ipv6h->version == 4) {
863 		iph = (struct iphdr *)ipv6h;
864 		offset += iph->ihl * 4;
865 
866 		/* special-case 6in4 tunnelling, as that is a common way to get
867 		 * v6 connectivity in the home
868 		 */
869 		if (iph->protocol == IPPROTO_IPV6) {
870 			ipv6h = skb_header_pointer(skb, offset,
871 						   sizeof(_ipv6h), &_ipv6h);
872 
873 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
874 				return NULL;
875 
876 			offset += sizeof(struct ipv6hdr);
877 
878 		} else if (iph->protocol != IPPROTO_TCP) {
879 			return NULL;
880 		}
881 
882 	} else if (ipv6h->version == 6) {
883 		if (ipv6h->nexthdr != IPPROTO_TCP)
884 			return NULL;
885 
886 		offset += sizeof(struct ipv6hdr);
887 	} else {
888 		return NULL;
889 	}
890 
891 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
892 	if (!tcph)
893 		return NULL;
894 
895 	return skb_header_pointer(skb, offset,
896 				  min(__tcp_hdrlen(tcph), bufsize), buf);
897 }
898 
899 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
900 				   int code, int *oplen)
901 {
902 	/* inspired by tcp_parse_options in tcp_input.c */
903 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
904 	const u8 *ptr = (const u8 *)(tcph + 1);
905 
906 	while (length > 0) {
907 		int opcode = *ptr++;
908 		int opsize;
909 
910 		if (opcode == TCPOPT_EOL)
911 			break;
912 		if (opcode == TCPOPT_NOP) {
913 			length--;
914 			continue;
915 		}
916 		opsize = *ptr++;
917 		if (opsize < 2 || opsize > length)
918 			break;
919 
920 		if (opcode == code) {
921 			*oplen = opsize;
922 			return ptr;
923 		}
924 
925 		ptr += opsize - 2;
926 		length -= opsize;
927 	}
928 
929 	return NULL;
930 }
931 
932 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
933  * bytes than the other. In the case where both sequences SACK bytes that the
934  * other doesn't, A is considered greater. DSACKs in A also make A count
935  * as greater.
936  *
937  * @return -1, 0 or 1 as normal compare functions
938  */
939 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
940 				  const struct tcphdr *tcph_b)
941 {
942 	const struct tcp_sack_block_wire *sack_a, *sack_b;
943 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
944 	u32 bytes_a = 0, bytes_b = 0;
945 	int oplen_a, oplen_b;
946 	bool first = true;
947 
948 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
949 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
950 
951 	/* pointers point to option contents */
952 	oplen_a -= TCPOLEN_SACK_BASE;
953 	oplen_b -= TCPOLEN_SACK_BASE;
954 
955 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
956 	    (!sack_b || oplen_b < sizeof(*sack_b)))
957 		return -1;
958 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
959 		 (!sack_a || oplen_a < sizeof(*sack_a)))
960 		return 1;
961 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
962 		 (!sack_b || oplen_b < sizeof(*sack_b)))
963 		return 0;
964 
965 	while (oplen_a >= sizeof(*sack_a)) {
966 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
967 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
968 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
969 		int oplen_tmp = oplen_b;
970 		bool found = false;
971 
972 		/* DSACK; always considered greater to prevent dropping */
973 		if (before(start_a, ack_seq_a))
974 			return -1;
975 
976 		bytes_a += end_a - start_a;
977 
978 		while (oplen_tmp >= sizeof(*sack_tmp)) {
979 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
980 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
981 
982 			/* first time through we count the total size */
983 			if (first)
984 				bytes_b += end_b - start_b;
985 
986 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
987 				found = true;
988 				if (!first)
989 					break;
990 			}
991 			oplen_tmp -= sizeof(*sack_tmp);
992 			sack_tmp++;
993 		}
994 
995 		if (!found)
996 			return -1;
997 
998 		oplen_a -= sizeof(*sack_a);
999 		sack_a++;
1000 		first = false;
1001 	}
1002 
1003 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1004 	 * either the SACKs are equal, or B SACKs more bytes.
1005 	 */
1006 	return bytes_b > bytes_a ? 1 : 0;
1007 }
1008 
1009 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1010 				 u32 *tsval, u32 *tsecr)
1011 {
1012 	const u8 *ptr;
1013 	int opsize;
1014 
1015 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1016 
1017 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1018 		*tsval = get_unaligned_be32(ptr);
1019 		*tsecr = get_unaligned_be32(ptr + 4);
1020 	}
1021 }
1022 
1023 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1024 			       u32 tstamp_new, u32 tsecr_new)
1025 {
1026 	/* inspired by tcp_parse_options in tcp_input.c */
1027 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1028 	const u8 *ptr = (const u8 *)(tcph + 1);
1029 	u32 tstamp, tsecr;
1030 
1031 	/* 3 reserved flags must be unset to avoid future breakage
1032 	 * ACK must be set
1033 	 * ECE/CWR are handled separately
1034 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1035 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1036 	 * 0x00C00000 = CWR/ECE (handled separately)
1037 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1038 	 */
1039 	if (((tcp_flag_word(tcph) &
1040 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1041 		return false;
1042 
1043 	while (length > 0) {
1044 		int opcode = *ptr++;
1045 		int opsize;
1046 
1047 		if (opcode == TCPOPT_EOL)
1048 			break;
1049 		if (opcode == TCPOPT_NOP) {
1050 			length--;
1051 			continue;
1052 		}
1053 		opsize = *ptr++;
1054 		if (opsize < 2 || opsize > length)
1055 			break;
1056 
1057 		switch (opcode) {
1058 		case TCPOPT_MD5SIG: /* doesn't influence state */
1059 			break;
1060 
1061 		case TCPOPT_SACK: /* stricter checking performed later */
1062 			if (opsize % 8 != 2)
1063 				return false;
1064 			break;
1065 
1066 		case TCPOPT_TIMESTAMP:
1067 			/* only drop timestamps lower than new */
1068 			if (opsize != TCPOLEN_TIMESTAMP)
1069 				return false;
1070 			tstamp = get_unaligned_be32(ptr);
1071 			tsecr = get_unaligned_be32(ptr + 4);
1072 			if (after(tstamp, tstamp_new) ||
1073 			    after(tsecr, tsecr_new))
1074 				return false;
1075 			break;
1076 
1077 		case TCPOPT_MSS:  /* these should only be set on SYN */
1078 		case TCPOPT_WINDOW:
1079 		case TCPOPT_SACK_PERM:
1080 		case TCPOPT_FASTOPEN:
1081 		case TCPOPT_EXP:
1082 		default: /* don't drop if any unknown options are present */
1083 			return false;
1084 		}
1085 
1086 		ptr += opsize - 2;
1087 		length -= opsize;
1088 	}
1089 
1090 	return true;
1091 }
1092 
1093 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1094 				       struct cake_flow *flow)
1095 {
1096 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1097 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1098 	struct sk_buff *skb_check, *skb_prev = NULL;
1099 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1100 	unsigned char _tcph[64], _tcph_check[64];
1101 	const struct tcphdr *tcph, *tcph_check;
1102 	const struct iphdr *iph, *iph_check;
1103 	struct ipv6hdr _iph, _iph_check;
1104 	const struct sk_buff *skb;
1105 	int seglen, num_found = 0;
1106 	u32 tstamp = 0, tsecr = 0;
1107 	__be32 elig_flags = 0;
1108 	int sack_comp;
1109 
1110 	/* no other possible ACKs to filter */
1111 	if (flow->head == flow->tail)
1112 		return NULL;
1113 
1114 	skb = flow->tail;
1115 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1116 	iph = cake_get_iphdr(skb, &_iph);
1117 	if (!tcph)
1118 		return NULL;
1119 
1120 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1121 
1122 	/* the 'triggering' packet need only have the ACK flag set.
1123 	 * also check that SYN is not set, as there won't be any previous ACKs.
1124 	 */
1125 	if ((tcp_flag_word(tcph) &
1126 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1127 		return NULL;
1128 
1129 	/* the 'triggering' ACK is at the tail of the queue, we have already
1130 	 * returned if it is the only packet in the flow. loop through the rest
1131 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1132 	 * triggering one.
1133 	 */
1134 	for (skb_check = flow->head;
1135 	     skb_check && skb_check != skb;
1136 	     skb_prev = skb_check, skb_check = skb_check->next) {
1137 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1138 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1139 					     sizeof(_tcph_check));
1140 
1141 		/* only TCP packets with a matching 5-tuple are eligible, and only
1142 		 * if the header is safe to drop
1143 		 */
1144 		if (!tcph_check || iph->version != iph_check->version ||
1145 		    tcph_check->source != tcph->source ||
1146 		    tcph_check->dest != tcph->dest)
1147 			continue;
1148 
1149 		if (iph_check->version == 4) {
1150 			if (iph_check->saddr != iph->saddr ||
1151 			    iph_check->daddr != iph->daddr)
1152 				continue;
1153 
1154 			seglen = ntohs(iph_check->tot_len) -
1155 				       (4 * iph_check->ihl);
1156 		} else if (iph_check->version == 6) {
1157 			ipv6h = (struct ipv6hdr *)iph;
1158 			ipv6h_check = (struct ipv6hdr *)iph_check;
1159 
1160 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1161 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1162 				continue;
1163 
1164 			seglen = ntohs(ipv6h_check->payload_len);
1165 		} else {
1166 			WARN_ON(1);  /* shouldn't happen */
1167 			continue;
1168 		}
1169 
1170 		/* If the ECE/CWR flags changed from the previous eligible
1171 		 * packet in the same flow, we should no longer be dropping that
1172 		 * previous packet as this would lose information.
1173 		 */
1174 		if (elig_ack && (tcp_flag_word(tcph_check) &
1175 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1176 			elig_ack = NULL;
1177 			elig_ack_prev = NULL;
1178 			num_found--;
1179 		}
1180 
1181 		/* Check TCP options and flags, don't drop ACKs with segment
1182 		 * data, and don't drop ACKs with a higher cumulative ACK
1183 		 * counter than the triggering packet. Check ACK seqno here to
1184 		 * avoid parsing SACK options of packets we are going to exclude
1185 		 * anyway.
1186 		 */
1187 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1188 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1189 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1190 			continue;
1191 
1192 		/* Check SACK options. The triggering packet must SACK more data
1193 		 * than the ACK under consideration, or SACK the same range but
1194 		 * have a larger cumulative ACK counter. The latter is a
1195 		 * pathological case, but is contained in the following check
1196 		 * anyway, just to be safe.
1197 		 */
1198 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1199 
1200 		if (sack_comp < 0 ||
1201 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1202 		     sack_comp == 0))
1203 			continue;
1204 
1205 		/* At this point we have found an eligible pure ACK to drop; if
1206 		 * we are in aggressive mode, we are done. Otherwise, keep
1207 		 * searching unless this is the second eligible ACK we
1208 		 * found.
1209 		 *
1210 		 * Since we want to drop ACK closest to the head of the queue,
1211 		 * save the first eligible ACK we find, even if we need to loop
1212 		 * again.
1213 		 */
1214 		if (!elig_ack) {
1215 			elig_ack = skb_check;
1216 			elig_ack_prev = skb_prev;
1217 			elig_flags = (tcp_flag_word(tcph_check)
1218 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1219 		}
1220 
1221 		if (num_found++ > 0)
1222 			goto found;
1223 	}
1224 
1225 	/* We made it through the queue without finding two eligible ACKs. If
1226 	 * we found a single eligible ACK we can drop it in aggressive mode if
1227 	 * we can guarantee that this does not interfere with ECN flag
1228 	 * information. We ensure this by dropping it only if the enqueued
1229 	 * packet is consecutive with the eligible ACK, and their flags match.
1230 	 */
1231 	if (elig_ack && aggressive && elig_ack->next == skb &&
1232 	    (elig_flags == (tcp_flag_word(tcph) &
1233 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1234 		goto found;
1235 
1236 	return NULL;
1237 
1238 found:
1239 	if (elig_ack_prev)
1240 		elig_ack_prev->next = elig_ack->next;
1241 	else
1242 		flow->head = elig_ack->next;
1243 
1244 	elig_ack->next = NULL;
1245 
1246 	return elig_ack;
1247 }
1248 
1249 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1250 {
1251 	avg -= avg >> shift;
1252 	avg += sample >> shift;
1253 	return avg;
1254 }
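
/* cake_ewma() is an exponentially weighted moving average in which each new
 * sample contributes a weight of 1/2^shift: shift = 8 gives a slow average
 * (1/256 per sample) while shift = 2 tracks the input much more quickly
 * (1/4 per sample).  Both settings are used by the ingress bandwidth
 * estimator in cake_enqueue() below.
 */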
1255 
1256 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1257 {
1258 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1259 		len -= off;
1260 
1261 	if (q->max_netlen < len)
1262 		q->max_netlen = len;
1263 	if (q->min_netlen > len)
1264 		q->min_netlen = len;
1265 
1266 	len += q->rate_overhead;
1267 
1268 	if (len < q->rate_mpu)
1269 		len = q->rate_mpu;
1270 
1271 	if (q->atm_mode == CAKE_ATM_ATM) {
1272 		len += 47;
1273 		len /= 48;
1274 		len *= 53;
1275 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1276 		/* Add one byte per 64 bytes or part thereof.
1277 		 * This is conservative and easier to calculate than the
1278 		 * precise value.
1279 		 */
1280 		len += (len + 63) / 64;
1281 	}
1282 
1283 	if (q->max_adjlen < len)
1284 		q->max_adjlen = len;
1285 	if (q->min_adjlen > len)
1286 		q->min_adjlen = len;
1287 
1288 	return len;
1289 }
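
/* Worked example of the framing compensation above (ignoring rate_overhead
 * and rate_mpu): a 120-byte packet on an ATM-based link occupies
 * ceil(120 / 48) = 3 cells of 53 bytes, so len becomes (120 + 47) / 48 * 53
 * = 159.  On a PTM link the same packet is charged 120 + (120 + 63) / 64 =
 * 122 bytes, i.e. one extra byte per 64-byte block or part thereof.
 */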
1290 
1291 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1292 {
1293 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1294 	unsigned int hdr_len, last_len = 0;
1295 	u32 off = skb_network_offset(skb);
1296 	u32 len = qdisc_pkt_len(skb);
1297 	u16 segs = 1;
1298 
1299 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1300 
1301 	if (!shinfo->gso_size)
1302 		return cake_calc_overhead(q, len, off);
1303 
1304 	/* borrowed from qdisc_pkt_len_init() */
1305 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1306 
1307 	/* + transport layer */
1308 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1309 						SKB_GSO_TCPV6))) {
1310 		const struct tcphdr *th;
1311 		struct tcphdr _tcphdr;
1312 
1313 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1314 					sizeof(_tcphdr), &_tcphdr);
1315 		if (likely(th))
1316 			hdr_len += __tcp_hdrlen(th);
1317 	} else {
1318 		struct udphdr _udphdr;
1319 
1320 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1321 				       sizeof(_udphdr), &_udphdr))
1322 			hdr_len += sizeof(struct udphdr);
1323 	}
1324 
1325 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1326 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1327 				    shinfo->gso_size);
1328 	else
1329 		segs = shinfo->gso_segs;
1330 
1331 	len = shinfo->gso_size + hdr_len;
1332 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1333 
1334 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1335 		cake_calc_overhead(q, last_len, off));
1336 }
1337 
1338 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1339 {
1340 	struct cake_heap_entry ii = q->overflow_heap[i];
1341 	struct cake_heap_entry jj = q->overflow_heap[j];
1342 
1343 	q->overflow_heap[i] = jj;
1344 	q->overflow_heap[j] = ii;
1345 
1346 	q->tins[ii.t].overflow_idx[ii.b] = j;
1347 	q->tins[jj.t].overflow_idx[jj.b] = i;
1348 }
1349 
1350 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1351 {
1352 	struct cake_heap_entry ii = q->overflow_heap[i];
1353 
1354 	return q->tins[ii.t].backlogs[ii.b];
1355 }
1356 
1357 static void cake_heapify(struct cake_sched_data *q, u16 i)
1358 {
1359 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1360 	u32 mb = cake_heap_get_backlog(q, i);
1361 	u32 m = i;
1362 
1363 	while (m < a) {
1364 		u32 l = m + m + 1;
1365 		u32 r = l + 1;
1366 
1367 		if (l < a) {
1368 			u32 lb = cake_heap_get_backlog(q, l);
1369 
1370 			if (lb > mb) {
1371 				m  = l;
1372 				mb = lb;
1373 			}
1374 		}
1375 
1376 		if (r < a) {
1377 			u32 rb = cake_heap_get_backlog(q, r);
1378 
1379 			if (rb > mb) {
1380 				m  = r;
1381 				mb = rb;
1382 			}
1383 		}
1384 
1385 		if (m != i) {
1386 			cake_heap_swap(q, i, m);
1387 			i = m;
1388 		} else {
1389 			break;
1390 		}
1391 	}
1392 }
1393 
1394 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1395 {
1396 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1397 		u16 p = (i - 1) >> 1;
1398 		u32 ib = cake_heap_get_backlog(q, i);
1399 		u32 pb = cake_heap_get_backlog(q, p);
1400 
1401 		if (ib > pb) {
1402 			cake_heap_swap(q, i, p);
1403 			i = p;
1404 		} else {
1405 			break;
1406 		}
1407 	}
1408 }
1409 
1410 static int cake_advance_shaper(struct cake_sched_data *q,
1411 			       struct cake_tin_data *b,
1412 			       struct sk_buff *skb,
1413 			       ktime_t now, bool drop)
1414 {
1415 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1416 
1417 	/* charge packet bandwidth to this tin
1418 	 * and to the global shaper.
1419 	 */
1420 	if (q->rate_ns) {
1421 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1422 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1423 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1424 
1425 		if (ktime_before(b->time_next_packet, now))
1426 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1427 							   tin_dur);
1428 
1429 		else if (ktime_before(b->time_next_packet,
1430 				      ktime_add_ns(now, tin_dur)))
1431 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1432 
1433 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1434 						   global_dur);
1435 		if (!drop)
1436 			q->failsafe_next_packet = \
1437 				ktime_add_ns(q->failsafe_next_packet,
1438 					     failsafe_dur);
1439 	}
1440 	return len;
1441 }
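
/* Putting numbers on the shaper arithmetic: rate_ns/rate_shft encode the
 * per-byte transmission time, so (len * rate_ns) >> rate_shft is the
 * serialisation delay of the packet at the configured rate.  At 100 Mbit/s a
 * 1500-byte packet advances time_next_packet by about 120 us, and the
 * failsafe deadline by 1.5x that, roughly 180 us.
 */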
1442 
1443 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1444 {
1445 	struct cake_sched_data *q = qdisc_priv(sch);
1446 	ktime_t now = ktime_get();
1447 	u32 idx = 0, tin = 0, len;
1448 	struct cake_heap_entry qq;
1449 	struct cake_tin_data *b;
1450 	struct cake_flow *flow;
1451 	struct sk_buff *skb;
1452 
1453 	if (!q->overflow_timeout) {
1454 		int i;
1455 		/* Build fresh max-heap */
1456 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1457 			cake_heapify(q, i);
1458 	}
1459 	q->overflow_timeout = 65535;
1460 
1461 	/* select longest queue for pruning */
1462 	qq  = q->overflow_heap[0];
1463 	tin = qq.t;
1464 	idx = qq.b;
1465 
1466 	b = &q->tins[tin];
1467 	flow = &b->flows[idx];
1468 	skb = dequeue_head(flow);
1469 	if (unlikely(!skb)) {
1470 		/* heap has gone wrong, rebuild it next time */
1471 		q->overflow_timeout = 0;
1472 		return idx + (tin << 16);
1473 	}
1474 
1475 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1476 		b->unresponsive_flow_count++;
1477 
1478 	len = qdisc_pkt_len(skb);
1479 	q->buffer_used      -= skb->truesize;
1480 	b->backlogs[idx]    -= len;
1481 	b->tin_backlog      -= len;
1482 	sch->qstats.backlog -= len;
1483 	qdisc_tree_reduce_backlog(sch, 1, len);
1484 
1485 	flow->dropped++;
1486 	b->tin_dropped++;
1487 	sch->qstats.drops++;
1488 
1489 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1490 		cake_advance_shaper(q, b, skb, now, true);
1491 
1492 	__qdisc_drop(skb, to_free);
1493 	sch->q.qlen--;
1494 
1495 	cake_heapify(q, 0);
1496 
1497 	return idx + (tin << 16);
1498 }
1499 
1500 static void cake_wash_diffserv(struct sk_buff *skb)
1501 {
1502 	switch (skb->protocol) {
1503 	case htons(ETH_P_IP):
1504 		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1505 		break;
1506 	case htons(ETH_P_IPV6):
1507 		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1508 		break;
1509 	default:
1510 		break;
1511 	}
1512 }
1513 
1514 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1515 {
1516 	u8 dscp;
1517 
1518 	switch (skb->protocol) {
1519 	case htons(ETH_P_IP):
1520 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1521 		if (wash && dscp)
1522 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1523 		return dscp;
1524 
1525 	case htons(ETH_P_IPV6):
1526 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1527 		if (wash && dscp)
1528 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1529 		return dscp;
1530 
1531 	case htons(ETH_P_ARP):
1532 		return 0x38;  /* CS7 - Net Control */
1533 
1534 	default:
1535 		/* If there is no Diffserv field, treat as best-effort */
1536 		return 0;
1537 	}
1538 }
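
/* Example: an IPv4 packet marked EF with ECT(0) has a dsfield of 0xba, so
 * cake_handle_diffserv() returns 0xba >> 2 = 46 (EF).  When washing, the
 * ipv{4,6}_change_dsfield(..., INET_ECN_MASK, 0) calls keep only the two ECN
 * bits, leaving the dsfield at 0x02 with the DSCP cleared.
 */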
1539 
1540 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1541 					     struct sk_buff *skb)
1542 {
1543 	struct cake_sched_data *q = qdisc_priv(sch);
1544 	u32 tin;
1545 
1546 	if (TC_H_MAJ(skb->priority) == sch->handle &&
1547 	    TC_H_MIN(skb->priority) > 0 &&
1548 	    TC_H_MIN(skb->priority) <= q->tin_cnt) {
1549 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1550 
1551 		if (q->rate_flags & CAKE_FLAG_WASH)
1552 			cake_wash_diffserv(skb);
1553 	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
1554 		/* extract the Diffserv Precedence field, if it exists */
1555 		/* and clear DSCP bits if washing */
1556 		tin = q->tin_index[cake_handle_diffserv(skb,
1557 				q->rate_flags & CAKE_FLAG_WASH)];
1558 		if (unlikely(tin >= q->tin_cnt))
1559 			tin = 0;
1560 	} else {
1561 		tin = 0;
1562 		if (q->rate_flags & CAKE_FLAG_WASH)
1563 			cake_wash_diffserv(skb);
1564 	}
1565 
1566 	return &q->tins[tin];
1567 }
1568 
1569 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1570 			 struct sk_buff *skb, int flow_mode, int *qerr)
1571 {
1572 	struct cake_sched_data *q = qdisc_priv(sch);
1573 	struct tcf_proto *filter;
1574 	struct tcf_result res;
1575 	u32 flow = 0;
1576 	int result;
1577 
1578 	filter = rcu_dereference_bh(q->filter_list);
1579 	if (!filter)
1580 		goto hash;
1581 
1582 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1583 	result = tcf_classify(skb, filter, &res, false);
1584 
1585 	if (result >= 0) {
1586 #ifdef CONFIG_NET_CLS_ACT
1587 		switch (result) {
1588 		case TC_ACT_STOLEN:
1589 		case TC_ACT_QUEUED:
1590 		case TC_ACT_TRAP:
1591 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1592 			/* fall through */
1593 		case TC_ACT_SHOT:
1594 			return 0;
1595 		}
1596 #endif
1597 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1598 			flow = TC_H_MIN(res.classid);
1599 	}
1600 hash:
1601 	*t = cake_select_tin(sch, skb);
1602 	return flow ?: cake_hash(*t, skb, flow_mode) + 1;
1603 }
1604 
1605 static void cake_reconfigure(struct Qdisc *sch);
1606 
1607 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1608 			struct sk_buff **to_free)
1609 {
1610 	struct cake_sched_data *q = qdisc_priv(sch);
1611 	int len = qdisc_pkt_len(skb);
1612 	int uninitialized_var(ret);
1613 	struct sk_buff *ack = NULL;
1614 	ktime_t now = ktime_get();
1615 	struct cake_tin_data *b;
1616 	struct cake_flow *flow;
1617 	u32 idx;
1618 
1619 	/* choose flow to insert into */
1620 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1621 	if (idx == 0) {
1622 		if (ret & __NET_XMIT_BYPASS)
1623 			qdisc_qstats_drop(sch);
1624 		__qdisc_drop(skb, to_free);
1625 		return ret;
1626 	}
1627 	idx--;
1628 	flow = &b->flows[idx];
1629 
1630 	/* ensure shaper state isn't stale */
1631 	if (!b->tin_backlog) {
1632 		if (ktime_before(b->time_next_packet, now))
1633 			b->time_next_packet = now;
1634 
1635 		if (!sch->q.qlen) {
1636 			if (ktime_before(q->time_next_packet, now)) {
1637 				q->failsafe_next_packet = now;
1638 				q->time_next_packet = now;
1639 			} else if (ktime_after(q->time_next_packet, now) &&
1640 				   ktime_after(q->failsafe_next_packet, now)) {
1641 				u64 next = \
1642 					min(ktime_to_ns(q->time_next_packet),
1643 					    ktime_to_ns(
1644 						   q->failsafe_next_packet));
1645 				sch->qstats.overlimits++;
1646 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1647 			}
1648 		}
1649 	}
1650 
1651 	if (unlikely(len > b->max_skblen))
1652 		b->max_skblen = len;
1653 
1654 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1655 		struct sk_buff *segs, *nskb;
1656 		netdev_features_t features = netif_skb_features(skb);
1657 		unsigned int slen = 0;
1658 
1659 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1660 		if (IS_ERR_OR_NULL(segs))
1661 			return qdisc_drop(skb, sch, to_free);
1662 
1663 		while (segs) {
1664 			nskb = segs->next;
1665 			segs->next = NULL;
1666 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1667 			cobalt_set_enqueue_time(segs, now);
1668 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1669 									  segs);
1670 			flow_queue_add(flow, segs);
1671 
1672 			sch->q.qlen++;
1673 			slen += segs->len;
1674 			q->buffer_used += segs->truesize;
1675 			b->packets++;
1676 			segs = nskb;
1677 		}
1678 
1679 		/* stats */
1680 		b->bytes	    += slen;
1681 		b->backlogs[idx]    += slen;
1682 		b->tin_backlog      += slen;
1683 		sch->qstats.backlog += slen;
1684 		q->avg_window_bytes += slen;
1685 
1686 		qdisc_tree_reduce_backlog(sch, 1, len);
1687 		consume_skb(skb);
1688 	} else {
1689 		/* not splitting */
1690 		cobalt_set_enqueue_time(skb, now);
1691 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1692 		flow_queue_add(flow, skb);
1693 
1694 		if (q->ack_filter)
1695 			ack = cake_ack_filter(q, flow);
1696 
1697 		if (ack) {
1698 			b->ack_drops++;
1699 			sch->qstats.drops++;
1700 			b->bytes += qdisc_pkt_len(ack);
1701 			len -= qdisc_pkt_len(ack);
1702 			q->buffer_used += skb->truesize - ack->truesize;
1703 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1704 				cake_advance_shaper(q, b, ack, now, true);
1705 
1706 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1707 			consume_skb(ack);
1708 		} else {
1709 			sch->q.qlen++;
1710 			q->buffer_used      += skb->truesize;
1711 		}
1712 
1713 		/* stats */
1714 		b->packets++;
1715 		b->bytes	    += len;
1716 		b->backlogs[idx]    += len;
1717 		b->tin_backlog      += len;
1718 		sch->qstats.backlog += len;
1719 		q->avg_window_bytes += len;
1720 	}
1721 
1722 	if (q->overflow_timeout)
1723 		cake_heapify_up(q, b->overflow_idx[idx]);
1724 
1725 	/* incoming bandwidth capacity estimate */
1726 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1727 		u64 packet_interval = \
1728 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1729 
1730 		if (packet_interval > NSEC_PER_SEC)
1731 			packet_interval = NSEC_PER_SEC;
1732 
1733 		/* filter out short-term bursts, e.g. wifi aggregation */
1734 		q->avg_packet_interval = \
1735 			cake_ewma(q->avg_packet_interval,
1736 				  packet_interval,
1737 				  (packet_interval > q->avg_packet_interval ?
1738 					  2 : 8));
1739 
1740 		q->last_packet_time = now;
1741 
1742 		if (packet_interval > q->avg_packet_interval) {
1743 			u64 window_interval = \
1744 				ktime_to_ns(ktime_sub(now,
1745 						      q->avg_window_begin));
1746 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1747 
1748 			do_div(b, window_interval);
1749 			q->avg_peak_bandwidth =
1750 				cake_ewma(q->avg_peak_bandwidth, b,
1751 					  b > q->avg_peak_bandwidth ? 2 : 8);
1752 			q->avg_window_bytes = 0;
1753 			q->avg_window_begin = now;
1754 
1755 			if (ktime_after(now,
1756 					ktime_add_ms(q->last_reconfig_time,
1757 						     250))) {
1758 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1759 				cake_reconfigure(sch);
1760 			}
1761 		}
1762 	} else {
1763 		q->avg_window_bytes = 0;
1764 		q->last_packet_time = now;
1765 	}
1766 
1767 	/* flowchain */
1768 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1769 		struct cake_host *srchost = &b->hosts[flow->srchost];
1770 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1771 		u16 host_load = 1;
1772 
1773 		if (!flow->set) {
1774 			list_add_tail(&flow->flowchain, &b->new_flows);
1775 		} else {
1776 			b->decaying_flow_count--;
1777 			list_move_tail(&flow->flowchain, &b->new_flows);
1778 		}
1779 		flow->set = CAKE_SET_SPARSE;
1780 		b->sparse_flow_count++;
1781 
1782 		if (cake_dsrc(q->flow_mode))
1783 			host_load = max(host_load, srchost->srchost_refcnt);
1784 
1785 		if (cake_ddst(q->flow_mode))
1786 			host_load = max(host_load, dsthost->dsthost_refcnt);
1787 
1788 		flow->deficit = (b->flow_quantum *
1789 				 quantum_div[host_load]) >> 16;
1790 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1791 		/* this flow was empty, accounted as a sparse flow, but actually
1792 		 * in the bulk rotation.
1793 		 */
1794 		flow->set = CAKE_SET_BULK;
1795 		b->sparse_flow_count--;
1796 		b->bulk_flow_count++;
1797 	}
1798 
1799 	if (q->buffer_used > q->buffer_max_used)
1800 		q->buffer_max_used = q->buffer_used;
1801 
1802 	if (q->buffer_used > q->buffer_limit) {
1803 		u32 dropped = 0;
1804 
1805 		while (q->buffer_used > q->buffer_limit) {
1806 			dropped++;
1807 			cake_drop(sch, to_free);
1808 		}
1809 		b->drop_overlimit += dropped;
1810 	}
1811 	return NET_XMIT_SUCCESS;
1812 }
1813 
1814 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1815 {
1816 	struct cake_sched_data *q = qdisc_priv(sch);
1817 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1818 	struct cake_flow *flow = &b->flows[q->cur_flow];
1819 	struct sk_buff *skb = NULL;
1820 	u32 len;
1821 
1822 	if (flow->head) {
1823 		skb = dequeue_head(flow);
1824 		len = qdisc_pkt_len(skb);
1825 		b->backlogs[q->cur_flow] -= len;
1826 		b->tin_backlog		 -= len;
1827 		sch->qstats.backlog      -= len;
1828 		q->buffer_used		 -= skb->truesize;
1829 		sch->q.qlen--;
1830 
1831 		if (q->overflow_timeout)
1832 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1833 	}
1834 	return skb;
1835 }
1836 
1837 /* Discard leftover packets from a tin no longer in use. */
1838 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1839 {
1840 	struct cake_sched_data *q = qdisc_priv(sch);
1841 	struct sk_buff *skb;
1842 
1843 	q->cur_tin = tin;
1844 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1845 		while (!!(skb = cake_dequeue_one(sch)))
1846 			kfree_skb(skb);
1847 }
1848 
1849 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1850 {
1851 	struct cake_sched_data *q = qdisc_priv(sch);
1852 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1853 	struct cake_host *srchost, *dsthost;
1854 	ktime_t now = ktime_get();
1855 	struct cake_flow *flow;
1856 	struct list_head *head;
1857 	bool first_flow = true;
1858 	struct sk_buff *skb;
1859 	u16 host_load;
1860 	u64 delay;
1861 	u32 len;
1862 
1863 begin:
1864 	if (!sch->q.qlen)
1865 		return NULL;
1866 
1867 	/* global hard shaper */
1868 	if (ktime_after(q->time_next_packet, now) &&
1869 	    ktime_after(q->failsafe_next_packet, now)) {
1870 		u64 next = min(ktime_to_ns(q->time_next_packet),
1871 			       ktime_to_ns(q->failsafe_next_packet));
1872 
1873 		sch->qstats.overlimits++;
1874 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1875 		return NULL;
1876 	}
1877 
1878 	/* Choose a class to work on. */
1879 	if (!q->rate_ns) {
1880 		/* In unlimited mode, can't rely on shaper timings, just balance
1881 		 * with DRR
1882 		 */
1883 		bool wrapped = false, empty = true;
1884 
1885 		while (b->tin_deficit < 0 ||
1886 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1887 			if (b->tin_deficit <= 0)
1888 				b->tin_deficit += b->tin_quantum_band;
1889 			if (b->sparse_flow_count + b->bulk_flow_count)
1890 				empty = false;
1891 
1892 			q->cur_tin++;
1893 			b++;
1894 			if (q->cur_tin >= q->tin_cnt) {
1895 				q->cur_tin = 0;
1896 				b = q->tins;
1897 
1898 				if (wrapped) {
1899 					/* It's possible for q->qlen to be
1900 					 * nonzero when we actually have no
1901 					 * packets anywhere.
1902 					 */
1903 					if (empty)
1904 						return NULL;
1905 				} else {
1906 					wrapped = true;
1907 				}
1908 			}
1909 		}
1910 	} else {
1911 		/* In shaped mode, choose:
1912 		 * - Highest-priority tin with queue and meeting schedule, or
1913 		 * - The earliest-scheduled tin with queue.
1914 		 */
1915 		ktime_t best_time = KTIME_MAX;
1916 		int tin, best_tin = 0;
1917 
1918 		for (tin = 0; tin < q->tin_cnt; tin++) {
1919 			b = q->tins + tin;
1920 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1921 				ktime_t time_to_pkt =
1922 					ktime_sub(b->time_next_packet, now);
1923 
1924 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1925 				    ktime_compare(time_to_pkt,
1926 						  best_time) <= 0) {
1927 					best_time = time_to_pkt;
1928 					best_tin = tin;
1929 				}
1930 			}
1931 		}
1932 
1933 		q->cur_tin = best_tin;
1934 		b = q->tins + best_tin;
1935 
1936 		/* No point in going further if no packets to deliver. */
1937 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1938 			return NULL;
1939 	}
1940 
1941 retry:
1942 	/* service this class */
1943 	head = &b->decaying_flows;
1944 	if (!first_flow || list_empty(head)) {
1945 		head = &b->new_flows;
1946 		if (list_empty(head)) {
1947 			head = &b->old_flows;
1948 			if (unlikely(list_empty(head))) {
1949 				head = &b->decaying_flows;
1950 				if (unlikely(list_empty(head)))
1951 					goto begin;
1952 			}
1953 		}
1954 	}
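	/* The list chosen above fixes the service order within the tin: the
	 * decaying list is tried first, but only on the first pass; after
	 * that, new (sparse) flows are preferred over old (bulk) flows, with
	 * the decaying list as a last resort.  If all three lists are empty
	 * we go back to tin selection at begin.
	 */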
1955 	flow = list_first_entry(head, struct cake_flow, flowchain);
1956 	q->cur_flow = flow - b->flows;
1957 	first_flow = false;
1958 
1959 	/* triple isolation (modified DRR++) */
1960 	srchost = &b->hosts[flow->srchost];
1961 	dsthost = &b->hosts[flow->dsthost];
1962 	host_load = 1;
1963 
1964 	if (cake_dsrc(q->flow_mode))
1965 		host_load = max(host_load, srchost->srchost_refcnt);
1966 
1967 	if (cake_ddst(q->flow_mode))
1968 		host_load = max(host_load, dsthost->dsthost_refcnt);
1969 
1970 	WARN_ON(host_load > CAKE_QUEUES);
1971 
1972 	/* flow isolation (DRR++) */
1973 	if (flow->deficit <= 0) {
1974 		/* The shifted prandom_u32() is a way to apply dithering to
1975 		 * avoid accumulating roundoff errors
1976 		 */
1977 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1978 				  (prandom_u32() >> 16)) >> 16;
1979 		list_move_tail(&flow->flowchain, &b->old_flows);
1980 
1981 		/* Keep all flows with deficits out of the sparse and decaying
1982 		 * rotations.  No non-empty flow can go into the decaying
1983 		 * rotation, so they can't get deficits
1984 		 */
1985 		if (flow->set == CAKE_SET_SPARSE) {
1986 			if (flow->head) {
1987 				b->sparse_flow_count--;
1988 				b->bulk_flow_count++;
1989 				flow->set = CAKE_SET_BULK;
1990 			} else {
1991 				/* we've moved it to the bulk rotation for
1992 				 * correct deficit accounting but we still want
1993 				 * to count it as a sparse flow, not a bulk one.
1994 				 */
1995 				flow->set = CAKE_SET_SPARSE_WAIT;
1996 			}
1997 		}
1998 		goto retry;
1999 	}
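	/* Note on the refill above: quantum_div[n] is 65535 / n (set up in
	 * cake_init() below), so the quantum handed out is approximately
	 * flow_quantum / host_load, i.e. each flow of a busy host gets a
	 * proportionally smaller share.  For example, with flow_quantum =
	 * 1514 and host_load = 4:
	 *   (1514 * 16383) >> 16 = 378, about 1514 / 4.
	 * The added (prandom_u32() >> 16) term dithers the final >> 16 so
	 * the truncation does not always round the same way.
	 */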
2000 
2001 	/* Retrieve a packet via the AQM */
2002 	while (1) {
2003 		skb = cake_dequeue_one(sch);
2004 		if (!skb) {
2005 			/* this queue was actually empty */
2006 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2007 				b->unresponsive_flow_count--;
2008 
2009 			if (flow->cvars.p_drop || flow->cvars.count ||
2010 			    ktime_before(now, flow->cvars.drop_next)) {
2011 				/* keep in the flowchain until the state has
2012 				 * decayed to rest
2013 				 */
2014 				list_move_tail(&flow->flowchain,
2015 					       &b->decaying_flows);
2016 				if (flow->set == CAKE_SET_BULK) {
2017 					b->bulk_flow_count--;
2018 					b->decaying_flow_count++;
2019 				} else if (flow->set == CAKE_SET_SPARSE ||
2020 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2021 					b->sparse_flow_count--;
2022 					b->decaying_flow_count++;
2023 				}
2024 				flow->set = CAKE_SET_DECAYING;
2025 			} else {
2026 				/* remove empty queue from the flowchain */
2027 				list_del_init(&flow->flowchain);
2028 				if (flow->set == CAKE_SET_SPARSE ||
2029 				    flow->set == CAKE_SET_SPARSE_WAIT)
2030 					b->sparse_flow_count--;
2031 				else if (flow->set == CAKE_SET_BULK)
2032 					b->bulk_flow_count--;
2033 				else
2034 					b->decaying_flow_count--;
2035 
2036 				flow->set = CAKE_SET_NONE;
2037 				srchost->srchost_refcnt--;
2038 				dsthost->dsthost_refcnt--;
2039 			}
2040 			goto begin;
2041 		}
2042 
2043 		/* Last packet in queue may be marked, shouldn't be dropped */
2044 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2045 					(b->bulk_flow_count *
2046 					 !!(q->rate_flags &
2047 					    CAKE_FLAG_INGRESS))) ||
2048 		    !flow->head)
2049 			break;
2050 
2051 		/* drop this packet, get another one */
2052 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2053 			len = cake_advance_shaper(q, b, skb,
2054 						  now, true);
2055 			flow->deficit -= len;
2056 			b->tin_deficit -= len;
2057 		}
2058 		flow->dropped++;
2059 		b->tin_dropped++;
2060 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2061 		qdisc_qstats_drop(sch);
2062 		kfree_skb(skb);
2063 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2064 			goto retry;
2065 	}
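	/* Note on the loop above: when CAKE_FLAG_INGRESS is set, the shaper
	 * is advanced for dropped packets too (cake_advance_shaper(..., true))
	 * and their length is charged to the flow and tin deficits, since on
	 * the ingress side of a bottleneck a dropped packet has already
	 * consumed link capacity.  The goto retry then re-runs flow selection,
	 * which lets a flow whose deficit has just been exhausted be rotated
	 * out before another packet is pulled.
	 */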
2066 
2067 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2068 	qdisc_bstats_update(sch, skb);
2069 
2070 	/* collect delay stats */
2071 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2072 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2073 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2074 				  delay > b->peak_delay ? 2 : 8);
2075 	b->base_delay = cake_ewma(b->base_delay, delay,
2076 				  delay < b->base_delay ? 2 : 8);
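	/* The three EWMAs above feed the per-tin delay statistics reported by
	 * cake_dump_stats() below: peak_delay attacks fast (shift 2) when
	 * sojourn times rise and decays slowly, base_delay does the opposite,
	 * and avge_delay is a plain slow (shift 8) moving average.
	 */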
2077 
2078 	len = cake_advance_shaper(q, b, skb, now, false);
2079 	flow->deficit -= len;
2080 	b->tin_deficit -= len;
2081 
2082 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2083 		u64 next = min(ktime_to_ns(q->time_next_packet),
2084 			       ktime_to_ns(q->failsafe_next_packet));
2085 
2086 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2087 	} else if (!sch->q.qlen) {
2088 		int i;
2089 
2090 		for (i = 0; i < q->tin_cnt; i++) {
2091 			if (q->tins[i].decaying_flow_count) {
2092 				ktime_t next =
2093 					ktime_add_ns(now,
2094 						     q->tins[i].cparams.target);
2095 
2096 				qdisc_watchdog_schedule_ns(&q->watchdog,
2097 							   ktime_to_ns(next));
2098 				break;
2099 			}
2100 		}
2101 	}
2102 
2103 	if (q->overflow_timeout)
2104 		q->overflow_timeout--;
2105 
2106 	return skb;
2107 }
2108 
2109 static void cake_reset(struct Qdisc *sch)
2110 {
2111 	u32 c;
2112 
2113 	for (c = 0; c < CAKE_MAX_TINS; c++)
2114 		cake_clear_tin(sch, c);
2115 }
2116 
2117 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2118 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2119 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2120 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2121 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2122 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2123 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2124 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2125 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2126 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2127 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2128 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2129 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2130 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2131 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2132 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2133 };
2134 
2135 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2136 			  u64 target_ns, u64 rtt_est_ns)
2137 {
2138 	/* convert byte-rate into time-per-byte
2139 	 * so it will always unwedge in reasonable time.
2140 	 */
2141 	static const u64 MIN_RATE = 64;
2142 	u32 byte_target = mtu;
2143 	u64 byte_target_ns;
2144 	u8  rate_shft = 0;
2145 	u64 rate_ns = 0;
2146 
2147 	b->flow_quantum = 1514;
2148 	if (rate) {
2149 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2150 		rate_shft = 34;
2151 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2152 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2153 		while (!!(rate_ns >> 34)) {
2154 			rate_ns >>= 1;
2155 			rate_shft--;
2156 		}
2157 	} /* else unlimited, i.e. zero delay */
2158 
2159 	b->tin_rate_bps  = rate;
2160 	b->tin_rate_ns   = rate_ns;
2161 	b->tin_rate_shft = rate_shft;
2162 
2163 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2164 
2165 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2166 	b->cparams.interval = max(rtt_est_ns +
2167 				     b->cparams.target - target_ns,
2168 				     b->cparams.target * 2);
2169 	b->cparams.mtu_time = byte_target_ns;
2170 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2171 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2172 }
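/* Worked example for cake_set_rate(), with rate in bytes per second as used
 * throughout this file: at 1,250,000 bytes/s (10 Mbit/s) one byte takes
 * 800 ns, so rate_ns ends up as 800 << 24 with rate_shft = 24, and the
 * shaper charges len * 800 ns per packet.  flow_quantum becomes
 * min(1250000 >> 12, 1514) = 305 bytes.  With a 1514-byte MTU the
 * serialization delay is about 1.2 ms, so cparams.target stays at the
 * requested 5 ms and cparams.interval at 100 ms; only at low rates (roughly
 * a few Mbit/s and below, where one MTU takes several milliseconds) do the
 * max() expressions above stretch target and interval beyond the configured
 * values.
 */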
2173 
2174 static int cake_config_besteffort(struct Qdisc *sch)
2175 {
2176 	struct cake_sched_data *q = qdisc_priv(sch);
2177 	struct cake_tin_data *b = &q->tins[0];
2178 	u32 mtu = psched_mtu(qdisc_dev(sch));
2179 	u64 rate = q->rate_bps;
2180 
2181 	q->tin_cnt = 1;
2182 
2183 	q->tin_index = besteffort;
2184 	q->tin_order = normal_order;
2185 
2186 	cake_set_rate(b, rate, mtu,
2187 		      us_to_ns(q->target), us_to_ns(q->interval));
2188 	b->tin_quantum_band = 65535;
2189 	b->tin_quantum_prio = 65535;
2190 
2191 	return 0;
2192 }
2193 
2194 static int cake_config_precedence(struct Qdisc *sch)
2195 {
2196 	/* convert high-level (user visible) parameters into internal format */
2197 	struct cake_sched_data *q = qdisc_priv(sch);
2198 	u32 mtu = psched_mtu(qdisc_dev(sch));
2199 	u64 rate = q->rate_bps;
2200 	u32 quantum1 = 256;
2201 	u32 quantum2 = 256;
2202 	u32 i;
2203 
2204 	q->tin_cnt = 8;
2205 	q->tin_index = precedence;
2206 	q->tin_order = normal_order;
2207 
2208 	for (i = 0; i < q->tin_cnt; i++) {
2209 		struct cake_tin_data *b = &q->tins[i];
2210 
2211 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2212 			      us_to_ns(q->interval));
2213 
2214 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2215 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2216 
2217 		/* calculate next class's parameters */
2218 		rate  *= 7;
2219 		rate >>= 3;
2220 
2221 		quantum1  *= 3;
2222 		quantum1 >>= 1;
2223 
2224 		quantum2  *= 7;
2225 		quantum2 >>= 3;
2226 	}
2227 
2228 	return 0;
2229 }
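/* The loop above gives each successive precedence tin 7/8 of the previous
 * tin's rate threshold (so tin 7 ends up at about 39% of the base rate), a
 * priority weight that grows by 3/2 per tin (256, 384, 576, ...) and a
 * bandwidth-sharing weight that shrinks by 7/8 per tin (256, 224, 196, ...).
 * cake_config_diffserv8() below uses exactly the same progression, just with
 * a different codepoint-to-tin mapping.
 */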
2230 
2231 /*	List of known Diffserv codepoints:
2232  *
2233  *	Least Effort (CS1)
2234  *	Best Effort (CS0)
2235  *	Max Reliability & LLT "Lo" (TOS1)
2236  *	Max Throughput (TOS2)
2237  *	Min Delay (TOS4)
2238  *	LLT "La" (TOS5)
2239  *	Assured Forwarding 1 (AF1x) - x3
2240  *	Assured Forwarding 2 (AF2x) - x3
2241  *	Assured Forwarding 3 (AF3x) - x3
2242  *	Assured Forwarding 4 (AF4x) - x3
2243  *	Precedence Class 2 (CS2)
2244  *	Precedence Class 3 (CS3)
2245  *	Precedence Class 4 (CS4)
2246  *	Precedence Class 5 (CS5)
2247  *	Precedence Class 6 (CS6)
2248  *	Precedence Class 7 (CS7)
2249  *	Voice Admit (VA)
2250  *	Expedited Forwarding (EF)
2251  *
2252  *	Total 26 codepoints.
2253  */
2254 
2255 /*	List of traffic classes in RFC 4594:
2256  *		(roughly descending order of contended priority)
2257  *		(roughly ascending order of uncontended throughput)
2258  *
2259  *	Network Control (CS6,CS7)      - routing traffic
2260  *	Telephony (EF,VA)              - aka VoIP streams
2261  *	Signalling (CS5)               - VoIP setup
2262  *	Multimedia Conferencing (AF4x) - aka video calls
2263  *	Realtime Interactive (CS4)     - e.g. games
2264  *	Multimedia Streaming (AF3x)    - e.g. YouTube, Netflix, Twitch
2265  *	Broadcast Video (CS3)
2266  *	Low Latency Data (AF2x,TOS4)      - e.g. database
2267  *	Ops, Admin, Management (CS2,TOS1) - e.g. ssh
2268  *	Standard Service (CS0 & unrecognised codepoints)
2269  *	High Throughput Data (AF1x,TOS2)  - e.g. web traffic
2270  *	Low Priority Data (CS1)           - e.g. BitTorrent
2271  *
2272  *	Total 12 traffic classes.
2273  */
2274 
2275 static int cake_config_diffserv8(struct Qdisc *sch)
2276 {
2277 /*	Pruned list of traffic classes for typical applications:
2278  *
2279  *		Network Control          (CS6, CS7)
2280  *		Minimum Latency          (EF, VA, CS5, CS4)
2281  *		Interactive Shell        (CS2, TOS1)
2282  *		Low Latency Transactions (AF2x, TOS4)
2283  *		Video Streaming          (AF4x, AF3x, CS3)
2284  *		Bog Standard             (CS0 etc.)
2285  *		High Throughput          (AF1x, TOS2)
2286  *		Background Traffic       (CS1)
2287  *
2288  *		Total 8 traffic classes.
2289  */
2290 
2291 	struct cake_sched_data *q = qdisc_priv(sch);
2292 	u32 mtu = psched_mtu(qdisc_dev(sch));
2293 	u64 rate = q->rate_bps;
2294 	u32 quantum1 = 256;
2295 	u32 quantum2 = 256;
2296 	u32 i;
2297 
2298 	q->tin_cnt = 8;
2299 
2300 	/* codepoint to class mapping */
2301 	q->tin_index = diffserv8;
2302 	q->tin_order = normal_order;
2303 
2304 	/* class characteristics */
2305 	for (i = 0; i < q->tin_cnt; i++) {
2306 		struct cake_tin_data *b = &q->tins[i];
2307 
2308 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2309 			      us_to_ns(q->interval));
2310 
2311 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2312 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2313 
2314 		/* calculate next class's parameters */
2315 		rate  *= 7;
2316 		rate >>= 3;
2317 
2318 		quantum1  *= 3;
2319 		quantum1 >>= 1;
2320 
2321 		quantum2  *= 7;
2322 		quantum2 >>= 3;
2323 	}
2324 
2325 	return 0;
2326 }
2327 
2328 static int cake_config_diffserv4(struct Qdisc *sch)
2329 {
2330 /*  Further pruned list of traffic classes for four-class system:
2331  *
2332  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2333  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2334  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2335  *	    Background Traffic (CS1)
2336  *
2337  *		Total 4 traffic classes.
2338  */
2339 
2340 	struct cake_sched_data *q = qdisc_priv(sch);
2341 	u32 mtu = psched_mtu(qdisc_dev(sch));
2342 	u64 rate = q->rate_bps;
2343 	u32 quantum = 1024;
2344 
2345 	q->tin_cnt = 4;
2346 
2347 	/* codepoint to class mapping */
2348 	q->tin_index = diffserv4;
2349 	q->tin_order = bulk_order;
2350 
2351 	/* class characteristics */
2352 	cake_set_rate(&q->tins[0], rate, mtu,
2353 		      us_to_ns(q->target), us_to_ns(q->interval));
2354 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2355 		      us_to_ns(q->target), us_to_ns(q->interval));
2356 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2357 		      us_to_ns(q->target), us_to_ns(q->interval));
2358 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2359 		      us_to_ns(q->target), us_to_ns(q->interval));
2360 
2361 	/* priority weights */
2362 	q->tins[0].tin_quantum_prio = quantum;
2363 	q->tins[1].tin_quantum_prio = quantum >> 4;
2364 	q->tins[2].tin_quantum_prio = quantum << 2;
2365 	q->tins[3].tin_quantum_prio = quantum << 4;
2366 
2367 	/* bandwidth-sharing weights */
2368 	q->tins[0].tin_quantum_band = quantum;
2369 	q->tins[1].tin_quantum_band = quantum >> 4;
2370 	q->tins[2].tin_quantum_band = quantum >> 1;
2371 	q->tins[3].tin_quantum_band = quantum >> 2;
2372 
2373 	return 0;
2374 }
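/* For diffserv4 the tins are, in q->tins[] order: one at the full base rate,
 * one at a 1/16 (6.25%) threshold, one at 1/2 and one at 1/4; these
 * correspond to the Best Effort, Bulk, Video and Voice tins described in the
 * tc-cake documentation.  The priority weights above (1024, 64, 4096, 16384)
 * favour Voice and Video while they stay under their thresholds, while the
 * bandwidth weights (1024, 64, 512, 256) roughly track the thresholds.
 * bulk_order is used below only when enumerating tins for stats and class
 * walks.
 */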
2375 
2376 static int cake_config_diffserv3(struct Qdisc *sch)
2377 {
2378 /*  Simplified Diffserv structure with 3 tins.
2379  *		Low Priority		(CS1)
2380  *		Best Effort
2381  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2382  */
2383 	struct cake_sched_data *q = qdisc_priv(sch);
2384 	u32 mtu = psched_mtu(qdisc_dev(sch));
2385 	u64 rate = q->rate_bps;
2386 	u32 quantum = 1024;
2387 
2388 	q->tin_cnt = 3;
2389 
2390 	/* codepoint to class mapping */
2391 	q->tin_index = diffserv3;
2392 	q->tin_order = bulk_order;
2393 
2394 	/* class characteristics */
2395 	cake_set_rate(&q->tins[0], rate, mtu,
2396 		      us_to_ns(q->target), us_to_ns(q->interval));
2397 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2398 		      us_to_ns(q->target), us_to_ns(q->interval));
2399 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2400 		      us_to_ns(q->target), us_to_ns(q->interval));
2401 
2402 	/* priority weights */
2403 	q->tins[0].tin_quantum_prio = quantum;
2404 	q->tins[1].tin_quantum_prio = quantum >> 4;
2405 	q->tins[2].tin_quantum_prio = quantum << 4;
2406 
2407 	/* bandwidth-sharing weights */
2408 	q->tins[0].tin_quantum_band = quantum;
2409 	q->tins[1].tin_quantum_band = quantum >> 4;
2410 	q->tins[2].tin_quantum_band = quantum >> 2;
2411 
2412 	return 0;
2413 }
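/* As in diffserv4 above: tin 0 is Best Effort at the full rate, tin 1 is the
 * low-priority CS1 tin with a 1/16 threshold, and tin 2 is the
 * latency-sensitive tin with a 1/4 threshold and the largest priority
 * weight.
 */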
2414 
2415 static void cake_reconfigure(struct Qdisc *sch)
2416 {
2417 	struct cake_sched_data *q = qdisc_priv(sch);
2418 	int c, ft;
2419 
2420 	switch (q->tin_mode) {
2421 	case CAKE_DIFFSERV_BESTEFFORT:
2422 		ft = cake_config_besteffort(sch);
2423 		break;
2424 
2425 	case CAKE_DIFFSERV_PRECEDENCE:
2426 		ft = cake_config_precedence(sch);
2427 		break;
2428 
2429 	case CAKE_DIFFSERV_DIFFSERV8:
2430 		ft = cake_config_diffserv8(sch);
2431 		break;
2432 
2433 	case CAKE_DIFFSERV_DIFFSERV4:
2434 		ft = cake_config_diffserv4(sch);
2435 		break;
2436 
2437 	case CAKE_DIFFSERV_DIFFSERV3:
2438 	default:
2439 		ft = cake_config_diffserv3(sch);
2440 		break;
2441 	}
2442 
2443 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2444 		cake_clear_tin(sch, c);
2445 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2446 	}
2447 
2448 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2449 	q->rate_shft = q->tins[ft].tin_rate_shft;
2450 
2451 	if (q->buffer_config_limit) {
2452 		q->buffer_limit = q->buffer_config_limit;
2453 	} else if (q->rate_bps) {
2454 		u64 t = q->rate_bps * q->interval;
2455 
2456 		do_div(t, USEC_PER_SEC / 4);
2457 		q->buffer_limit = max_t(u32, t, 4U << 20);
2458 	} else {
2459 		q->buffer_limit = ~0;
2460 	}
2461 
2462 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2463 
2464 	q->buffer_limit = min(q->buffer_limit,
2465 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2466 				  q->buffer_config_limit));
2467 }
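/* Buffer sizing note: with a configured rate, the default memory limit works
 * out to four intervals' worth of bytes at that rate (t = rate_bps *
 * interval / 250000), floored at 4 MiB; e.g. 12,500,000 bytes/s (100 Mbit/s)
 * with the default 100 ms interval gives 5,000,000 bytes.  The final
 * min()/max() clamp keeps the limit at or below sch->limit packets' worth of
 * MTUs (roughly 15 MB with the defaults set in cake_init() below) unless a
 * larger memory limit was configured explicitly.
 */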
2468 
2469 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2470 		       struct netlink_ext_ack *extack)
2471 {
2472 	struct cake_sched_data *q = qdisc_priv(sch);
2473 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2474 	int err;
2475 
2476 	if (!opt)
2477 		return -EINVAL;
2478 
2479 	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2480 	if (err < 0)
2481 		return err;
2482 
2483 	if (tb[TCA_CAKE_NAT]) {
2484 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2485 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2486 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2487 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2488 #else
2489 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2490 				    "No conntrack support in kernel");
2491 		return -EOPNOTSUPP;
2492 #endif
2493 	}
2494 
2495 	if (tb[TCA_CAKE_BASE_RATE64])
2496 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2497 
2498 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2499 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2500 
2501 	if (tb[TCA_CAKE_WASH]) {
2502 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2503 			q->rate_flags |= CAKE_FLAG_WASH;
2504 		else
2505 			q->rate_flags &= ~CAKE_FLAG_WASH;
2506 	}
2507 
2508 	if (tb[TCA_CAKE_FLOW_MODE])
2509 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2510 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2511 					CAKE_FLOW_MASK));
2512 
2513 	if (tb[TCA_CAKE_ATM])
2514 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2515 
2516 	if (tb[TCA_CAKE_OVERHEAD]) {
2517 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2518 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2519 
2520 		q->max_netlen = 0;
2521 		q->max_adjlen = 0;
2522 		q->min_netlen = ~0;
2523 		q->min_adjlen = ~0;
2524 	}
2525 
2526 	if (tb[TCA_CAKE_RAW]) {
2527 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2528 
2529 		q->max_netlen = 0;
2530 		q->max_adjlen = 0;
2531 		q->min_netlen = ~0;
2532 		q->min_adjlen = ~0;
2533 	}
2534 
2535 	if (tb[TCA_CAKE_MPU])
2536 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2537 
2538 	if (tb[TCA_CAKE_RTT]) {
2539 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2540 
2541 		if (!q->interval)
2542 			q->interval = 1;
2543 	}
2544 
2545 	if (tb[TCA_CAKE_TARGET]) {
2546 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2547 
2548 		if (!q->target)
2549 			q->target = 1;
2550 	}
2551 
2552 	if (tb[TCA_CAKE_AUTORATE]) {
2553 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2554 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2555 		else
2556 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2557 	}
2558 
2559 	if (tb[TCA_CAKE_INGRESS]) {
2560 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2561 			q->rate_flags |= CAKE_FLAG_INGRESS;
2562 		else
2563 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2564 	}
2565 
2566 	if (tb[TCA_CAKE_ACK_FILTER])
2567 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2568 
2569 	if (tb[TCA_CAKE_MEMORY])
2570 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2571 
2572 	if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD)
2573 		q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2574 	else
2575 		q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2576 
2577 	if (q->tins) {
2578 		sch_tree_lock(sch);
2579 		cake_reconfigure(sch);
2580 		sch_tree_unlock(sch);
2581 	}
2582 
2583 	return 0;
2584 }
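/* For reference, these attributes normally arrive from iproute2's tc, e.g.
 * (illustrative; the exact keyword set depends on the tc version):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 100Mbit diffserv4 nat
 *
 * which supplies TCA_CAKE_BASE_RATE64 (in bytes per second, here 12500000),
 * TCA_CAKE_DIFFSERV_MODE and TCA_CAKE_NAT; attributes not present in the
 * message are simply left unchanged.
 */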
2585 
2586 static void cake_destroy(struct Qdisc *sch)
2587 {
2588 	struct cake_sched_data *q = qdisc_priv(sch);
2589 
2590 	qdisc_watchdog_cancel(&q->watchdog);
2591 	tcf_block_put(q->block);
2592 	kvfree(q->tins);
2593 }
2594 
2595 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2596 		     struct netlink_ext_ack *extack)
2597 {
2598 	struct cake_sched_data *q = qdisc_priv(sch);
2599 	int i, j, err;
2600 
2601 	sch->limit = 10240;
2602 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2603 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2604 
2605 	q->rate_bps = 0; /* unlimited by default */
2606 
2607 	q->interval = 100000; /* 100ms default */
2608 	q->target   =   5000; /* 5ms: codel RFC argues
2609 			       * for 5 to 10% of interval
2610 			       */
2611 
2612 	q->cur_tin = 0;
2613 	q->cur_flow  = 0;
2614 
2615 	qdisc_watchdog_init(&q->watchdog, sch);
2616 
2617 	if (opt) {
2618 		err = cake_change(sch, opt, extack);
2619 
2620 		if (err)
2621 			return err;
2622 	}
2623 
2624 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2625 	if (err)
2626 		return err;
2627 
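	/* Reciprocal table: quantum_div[n] is 65535 / n, so the per-host fair
	 * share in the fast path (the >> 16 in cake_dequeue()) costs a
	 * multiply and a shift instead of a divide.  Entry 0 is a saturated
	 * guard value; host_load is always at least 1.
	 */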
2628 	quantum_div[0] = ~0;
2629 	for (i = 1; i <= CAKE_QUEUES; i++)
2630 		quantum_div[i] = 65535 / i;
2631 
2632 	q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
2633 			   GFP_KERNEL);
2634 	if (!q->tins)
2635 		goto nomem;
2636 
2637 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2638 		struct cake_tin_data *b = q->tins + i;
2639 
2640 		INIT_LIST_HEAD(&b->new_flows);
2641 		INIT_LIST_HEAD(&b->old_flows);
2642 		INIT_LIST_HEAD(&b->decaying_flows);
2643 		b->sparse_flow_count = 0;
2644 		b->bulk_flow_count = 0;
2645 		b->decaying_flow_count = 0;
2646 
2647 		for (j = 0; j < CAKE_QUEUES; j++) {
2648 			struct cake_flow *flow = b->flows + j;
2649 			u32 k = j * CAKE_MAX_TINS + i;
2650 
2651 			INIT_LIST_HEAD(&flow->flowchain);
2652 			cobalt_vars_init(&flow->cvars);
2653 
2654 			q->overflow_heap[k].t = i;
2655 			q->overflow_heap[k].b = j;
2656 			b->overflow_idx[j] = k;
2657 		}
2658 	}
2659 
2660 	cake_reconfigure(sch);
2661 	q->avg_peak_bandwidth = q->rate_bps;
2662 	q->min_netlen = ~0;
2663 	q->min_adjlen = ~0;
2664 	return 0;
2665 
2666 nomem:
2667 	cake_destroy(sch);
2668 	return -ENOMEM;
2669 }
2670 
2671 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2672 {
2673 	struct cake_sched_data *q = qdisc_priv(sch);
2674 	struct nlattr *opts;
2675 
2676 	opts = nla_nest_start(skb, TCA_OPTIONS);
2677 	if (!opts)
2678 		goto nla_put_failure;
2679 
2680 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2681 			      TCA_CAKE_PAD))
2682 		goto nla_put_failure;
2683 
2684 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2685 			q->flow_mode & CAKE_FLOW_MASK))
2686 		goto nla_put_failure;
2687 
2688 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2689 		goto nla_put_failure;
2690 
2691 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2692 		goto nla_put_failure;
2693 
2694 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2695 		goto nla_put_failure;
2696 
2697 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2698 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2699 		goto nla_put_failure;
2700 
2701 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2702 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2703 		goto nla_put_failure;
2704 
2705 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2706 		goto nla_put_failure;
2707 
2708 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2709 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2710 		goto nla_put_failure;
2711 
2712 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2713 		goto nla_put_failure;
2714 
2715 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2716 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2717 		goto nla_put_failure;
2718 
2719 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2720 		goto nla_put_failure;
2721 
2722 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2723 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2724 			goto nla_put_failure;
2725 
2726 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2727 		goto nla_put_failure;
2728 
2729 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2730 		goto nla_put_failure;
2731 
2732 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2733 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2734 		goto nla_put_failure;
2735 
2736 	return nla_nest_end(skb, opts);
2737 
2738 nla_put_failure:
2739 	return -1;
2740 }
2741 
2742 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2743 {
2744 	struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2745 	struct cake_sched_data *q = qdisc_priv(sch);
2746 	struct nlattr *tstats, *ts;
2747 	int i;
2748 
2749 	if (!stats)
2750 		return -1;
2751 
2752 #define PUT_STAT_U32(attr, data) do {				       \
2753 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2754 			goto nla_put_failure;			       \
2755 	} while (0)
2756 #define PUT_STAT_U64(attr, data) do {				       \
2757 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2758 					data, TCA_CAKE_STATS_PAD)) \
2759 			goto nla_put_failure;			       \
2760 	} while (0)
2761 
2762 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2763 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2764 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2765 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2766 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2767 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2768 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2769 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2770 
2771 #undef PUT_STAT_U32
2772 #undef PUT_STAT_U64
2773 
2774 	tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2775 	if (!tstats)
2776 		goto nla_put_failure;
2777 
2778 #define PUT_TSTAT_U32(attr, data) do {					\
2779 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2780 			goto nla_put_failure;				\
2781 	} while (0)
2782 #define PUT_TSTAT_U64(attr, data) do {					\
2783 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2784 					data, TCA_CAKE_TIN_STATS_PAD))	\
2785 			goto nla_put_failure;				\
2786 	} while (0)
2787 
2788 	for (i = 0; i < q->tin_cnt; i++) {
2789 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2790 
2791 		ts = nla_nest_start(d->skb, i + 1);
2792 		if (!ts)
2793 			goto nla_put_failure;
2794 
2795 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2796 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2797 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2798 
2799 		PUT_TSTAT_U32(TARGET_US,
2800 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2801 		PUT_TSTAT_U32(INTERVAL_US,
2802 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2803 
2804 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2805 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2806 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2807 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2808 
2809 		PUT_TSTAT_U32(PEAK_DELAY_US,
2810 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2811 		PUT_TSTAT_U32(AVG_DELAY_US,
2812 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2813 		PUT_TSTAT_U32(BASE_DELAY_US,
2814 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2815 
2816 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2817 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2818 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2819 
2820 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2821 					    b->decaying_flow_count);
2822 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2823 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2824 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2825 
2826 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2827 		nla_nest_end(d->skb, ts);
2828 	}
2829 
2830 #undef PUT_TSTAT_U32
2831 #undef PUT_TSTAT_U64
2832 
2833 	nla_nest_end(d->skb, tstats);
2834 	return nla_nest_end(d->skb, stats);
2835 
2836 nla_put_failure:
2837 	nla_nest_cancel(d->skb, stats);
2838 	return -1;
2839 }
2840 
2841 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2842 {
2843 	return NULL;
2844 }
2845 
2846 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2847 {
2848 	return 0;
2849 }
2850 
2851 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2852 			       u32 classid)
2853 {
2854 	return 0;
2855 }
2856 
2857 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2858 {
2859 }
2860 
2861 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2862 					struct netlink_ext_ack *extack)
2863 {
2864 	struct cake_sched_data *q = qdisc_priv(sch);
2865 
2866 	if (cl)
2867 		return NULL;
2868 	return q->block;
2869 }
2870 
2871 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2872 			   struct sk_buff *skb, struct tcmsg *tcm)
2873 {
2874 	tcm->tcm_handle |= TC_H_MIN(cl);
2875 	return 0;
2876 }
2877 
2878 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2879 				 struct gnet_dump *d)
2880 {
2881 	struct cake_sched_data *q = qdisc_priv(sch);
2882 	const struct cake_flow *flow = NULL;
2883 	struct gnet_stats_queue qs = { 0 };
2884 	struct nlattr *stats;
2885 	u32 idx = cl - 1;
2886 
2887 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2888 		const struct cake_tin_data *b =
2889 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2890 		const struct sk_buff *skb;
2891 
2892 		flow = &b->flows[idx % CAKE_QUEUES];
2893 
2894 		if (flow->head) {
2895 			sch_tree_lock(sch);
2896 			skb = flow->head;
2897 			while (skb) {
2898 				qs.qlen++;
2899 				skb = skb->next;
2900 			}
2901 			sch_tree_unlock(sch);
2902 		}
2903 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2904 		qs.drops = flow->dropped;
2905 	}
2906 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2907 		return -1;
2908 	if (flow) {
2909 		ktime_t now = ktime_get();
2910 
2911 		stats = nla_nest_start(d->skb, TCA_STATS_APP);
2912 		if (!stats)
2913 			return -1;
2914 
2915 #define PUT_STAT_U32(attr, data) do {				       \
2916 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2917 			goto nla_put_failure;			       \
2918 	} while (0)
2919 #define PUT_STAT_S32(attr, data) do {				       \
2920 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2921 			goto nla_put_failure;			       \
2922 	} while (0)
2923 
2924 		PUT_STAT_S32(DEFICIT, flow->deficit);
2925 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2926 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2927 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2928 		if (flow->cvars.p_drop) {
2929 			PUT_STAT_S32(BLUE_TIMER_US,
2930 				     ktime_to_us(
2931 					     ktime_sub(now,
2932 						     flow->cvars.blue_timer)));
2933 		}
2934 		if (flow->cvars.dropping) {
2935 			PUT_STAT_S32(DROP_NEXT_US,
2936 				     ktime_to_us(
2937 					     ktime_sub(now,
2938 						       flow->cvars.drop_next)));
2939 		}
2940 
2941 		if (nla_nest_end(d->skb, stats) < 0)
2942 			return -1;
2943 	}
2944 
2945 	return 0;
2946 
2947 nla_put_failure:
2948 	nla_nest_cancel(d->skb, stats);
2949 	return -1;
2950 }
2951 
2952 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2953 {
2954 	struct cake_sched_data *q = qdisc_priv(sch);
2955 	unsigned int i, j;
2956 
2957 	if (arg->stop)
2958 		return;
2959 
2960 	for (i = 0; i < q->tin_cnt; i++) {
2961 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2962 
2963 		for (j = 0; j < CAKE_QUEUES; j++) {
2964 			if (list_empty(&b->flows[j].flowchain) ||
2965 			    arg->count < arg->skip) {
2966 				arg->count++;
2967 				continue;
2968 			}
2969 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
2970 				arg->stop = 1;
2971 				break;
2972 			}
2973 			arg->count++;
2974 		}
2975 	}
2976 }
2977 
2978 static const struct Qdisc_class_ops cake_class_ops = {
2979 	.leaf		=	cake_leaf,
2980 	.find		=	cake_find,
2981 	.tcf_block	=	cake_tcf_block,
2982 	.bind_tcf	=	cake_bind,
2983 	.unbind_tcf	=	cake_unbind,
2984 	.dump		=	cake_dump_class,
2985 	.dump_stats	=	cake_dump_class_stats,
2986 	.walk		=	cake_walk,
2987 };
2988 
2989 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
2990 	.cl_ops		=	&cake_class_ops,
2991 	.id		=	"cake",
2992 	.priv_size	=	sizeof(struct cake_sched_data),
2993 	.enqueue	=	cake_enqueue,
2994 	.dequeue	=	cake_dequeue,
2995 	.peek		=	qdisc_peek_dequeued,
2996 	.init		=	cake_init,
2997 	.reset		=	cake_reset,
2998 	.destroy	=	cake_destroy,
2999 	.change		=	cake_change,
3000 	.dump		=	cake_dump,
3001 	.dump_stats	=	cake_dump_stats,
3002 	.owner		=	THIS_MODULE,
3003 };
3004 
3005 static int __init cake_module_init(void)
3006 {
3007 	return register_qdisc(&cake_qdisc_ops);
3008 }
3009 
3010 static void __exit cake_module_exit(void)
3011 {
3012 	unregister_qdisc(&cake_qdisc_ops);
3013 }
3014 
3015 module_init(cake_module_init)
3016 module_exit(cake_module_exit)
3017 MODULE_AUTHOR("Jonathan Morton");
3018 MODULE_LICENSE("Dual BSD/GPL");
3019 MODULE_DESCRIPTION("The CAKE shaper.");
3020