xref: /openbmc/linux/net/sched/sch_cake.c (revision d32fd6bb9f2bc8178cdd65ebec1ad670a8bfa241)
1  // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2  
3  /* COMMON Applications Kept Enhanced (CAKE) discipline
4   *
5   * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6   * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7   * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8   * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9   * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10   * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11   *
12   * The CAKE Principles:
13   *		   (or, how to have your cake and eat it too)
14   *
15   * This is a combination of several shaping, AQM and FQ techniques into one
16   * easy-to-use package:
17   *
18   * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19   *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20   *   eliminating the need for any sort of burst parameter (eg. token bucket
21   *   depth).  Burst support is limited to that necessary to overcome scheduling
22   *   latency.
23   *
24   * - A Diffserv-aware priority queue, giving more priority to certain classes,
25   *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26   *   the priority is reduced to avoid starving other tins.
27   *
28   * - Each priority tin has a separate Flow Queue system, to isolate traffic
29   *   flows from each other.  This prevents a burst on one flow from increasing
30   *   the delay to another.  Flows are distributed to queues using a
31   *   set-associative hash function.
32   *
33   * - Each queue is actively managed by Cobalt, which is a combination of the
34   *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35   *   congestion early via ECN (if available) and/or packet drops, to keep
36   *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37   *   setting, as is necessary at low bandwidths.
38   *
39   * The configuration parameters are kept deliberately simple for ease of use.
40   * Everything has sane defaults.  Complete generality of configuration is *not*
41   * a goal.
42   *
43   * The priority queue operates according to a weighted DRR scheme, combined with
44   * a bandwidth tracker which reuses the shaper logic to detect which side of the
45   * bandwidth sharing threshold the tin is operating on.  This determines whether a
46   * priority-based weight (high) or a bandwidth-based weight (low) is used for
47   * that tin in the current pass.
48   *
49   * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50   * granted us permission to leverage.
51   */
52  
53  #include <linux/module.h>
54  #include <linux/types.h>
55  #include <linux/kernel.h>
56  #include <linux/jiffies.h>
57  #include <linux/string.h>
58  #include <linux/in.h>
59  #include <linux/errno.h>
60  #include <linux/init.h>
61  #include <linux/skbuff.h>
62  #include <linux/jhash.h>
63  #include <linux/slab.h>
64  #include <linux/vmalloc.h>
65  #include <linux/reciprocal_div.h>
66  #include <net/netlink.h>
67  #include <linux/if_vlan.h>
68  #include <net/gso.h>
69  #include <net/pkt_sched.h>
70  #include <net/pkt_cls.h>
71  #include <net/tcp.h>
72  #include <net/flow_dissector.h>
73  
74  #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75  #include <net/netfilter/nf_conntrack_core.h>
76  #endif
77  
78  #define CAKE_SET_WAYS (8)
79  #define CAKE_MAX_TINS (8)
80  #define CAKE_QUEUES (1024)
81  #define CAKE_FLOW_MASK 63
82  #define CAKE_FLOW_NAT_FLAG 64
83  
84  /* struct cobalt_params - contains codel and blue parameters
85   * @interval:	codel initial drop rate
86   * @target:     maximum persistent sojourn time & blue update rate
87   * @mtu_time:   serialisation delay of maximum-size packet
88   * @p_inc:      increment of blue drop probability (0.32 fxp)
89   * @p_dec:      decrement of blue drop probability (0.32 fxp)
90   */
91  struct cobalt_params {
92  	u64	interval;
93  	u64	target;
94  	u64	mtu_time;
95  	u32	p_inc;
96  	u32	p_dec;
97  };
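/* Note on the 0.32 fixed-point ("fxp") probability fields above: the full
 * 32-bit range 0..~0U maps onto 0.0..1.0, so for example a value of
 * (1 << 24) is 2^24 / 2^32 = 1/256 and (1 << 20) is 1/4096.  The other
 * fields are plain nanosecond counts; cake's usual auto-tuned defaults are
 * on the order of target = 5 ms and interval = 100 ms, derived from the
 * configured rate/RTT elsewhere in this file.
 */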
98  
99  /* struct cobalt_vars - contains codel and blue variables
100   * @count:		codel dropping frequency
101   * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102   * @drop_next:		time to drop next packet, or when we dropped last
103   * @blue_timer:		Blue time to next drop
104   * @p_drop:		BLUE drop probability (0.32 fxp)
105   * @dropping:		set if in dropping state
106   * @ecn_marked:		set if marked
107   */
108  struct cobalt_vars {
109  	u32	count;
110  	u32	rec_inv_sqrt;
111  	ktime_t	drop_next;
112  	ktime_t	blue_timer;
113  	u32     p_drop;
114  	bool	dropping;
115  	bool    ecn_marked;
116  };
117  
118  enum {
119  	CAKE_SET_NONE = 0,
120  	CAKE_SET_SPARSE,
121  	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122  	CAKE_SET_BULK,
123  	CAKE_SET_DECAYING
124  };
125  
126  struct cake_flow {
127  	/* this stuff is all needed per-flow at dequeue time */
128  	struct sk_buff	  *head;
129  	struct sk_buff	  *tail;
130  	struct list_head  flowchain;
131  	s32		  deficit;
132  	u32		  dropped;
133  	struct cobalt_vars cvars;
134  	u16		  srchost; /* index into cake_host table */
135  	u16		  dsthost;
136  	u8		  set;
137  }; /* please try to keep this structure <= 64 bytes */
138  
139  struct cake_host {
140  	u32 srchost_tag;
141  	u32 dsthost_tag;
142  	u16 srchost_bulk_flow_count;
143  	u16 dsthost_bulk_flow_count;
144  };
145  
146  struct cake_heap_entry {
147  	u16 t:3, b:10;
148  };
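/* A heap entry packs a tin index (t:3, enough for CAKE_MAX_TINS == 8) and a
 * queue index (b:10, enough for CAKE_QUEUES == 1024) into 13 bits, so the
 * whole overflow heap of 8 * 1024 entries stays compact at 2 bytes each.
 */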
149  
150  struct cake_tin_data {
151  	struct cake_flow flows[CAKE_QUEUES];
152  	u32	backlogs[CAKE_QUEUES];
153  	u32	tags[CAKE_QUEUES]; /* for set association */
154  	u16	overflow_idx[CAKE_QUEUES];
155  	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156  	u16	flow_quantum;
157  
158  	struct cobalt_params cparams;
159  	u32	drop_overlimit;
160  	u16	bulk_flow_count;
161  	u16	sparse_flow_count;
162  	u16	decaying_flow_count;
163  	u16	unresponsive_flow_count;
164  
165  	u32	max_skblen;
166  
167  	struct list_head new_flows;
168  	struct list_head old_flows;
169  	struct list_head decaying_flows;
170  
171  	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172  	ktime_t	time_next_packet;
173  	u64	tin_rate_ns;
174  	u64	tin_rate_bps;
175  	u16	tin_rate_shft;
176  
177  	u16	tin_quantum;
178  	s32	tin_deficit;
179  	u32	tin_backlog;
180  	u32	tin_dropped;
181  	u32	tin_ecn_mark;
182  
183  	u32	packets;
184  	u64	bytes;
185  
186  	u32	ack_drops;
187  
188  	/* moving averages */
189  	u64 avge_delay;
190  	u64 peak_delay;
191  	u64 base_delay;
192  
193  	/* hash function stats */
194  	u32	way_directs;
195  	u32	way_hits;
196  	u32	way_misses;
197  	u32	way_collisions;
198  }; /* number of tins is small, so size of this struct doesn't matter much */
199  
200  struct cake_sched_data {
201  	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202  	struct tcf_block *block;
203  	struct cake_tin_data *tins;
204  
205  	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206  	u16		overflow_timeout;
207  
208  	u16		tin_cnt;
209  	u8		tin_mode;
210  	u8		flow_mode;
211  	u8		ack_filter;
212  	u8		atm_mode;
213  
214  	u32		fwmark_mask;
215  	u16		fwmark_shft;
216  
217  	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218  	u16		rate_shft;
219  	ktime_t		time_next_packet;
220  	ktime_t		failsafe_next_packet;
221  	u64		rate_ns;
222  	u64		rate_bps;
223  	u16		rate_flags;
224  	s16		rate_overhead;
225  	u16		rate_mpu;
226  	u64		interval;
227  	u64		target;
228  
229  	/* resource tracking */
230  	u32		buffer_used;
231  	u32		buffer_max_used;
232  	u32		buffer_limit;
233  	u32		buffer_config_limit;
234  
235  	/* indices for dequeue */
236  	u16		cur_tin;
237  	u16		cur_flow;
238  
239  	struct qdisc_watchdog watchdog;
240  	const u8	*tin_index;
241  	const u8	*tin_order;
242  
243  	/* bandwidth capacity estimate */
244  	ktime_t		last_packet_time;
245  	ktime_t		avg_window_begin;
246  	u64		avg_packet_interval;
247  	u64		avg_window_bytes;
248  	u64		avg_peak_bandwidth;
249  	ktime_t		last_reconfig_time;
250  
251  	/* packet length stats */
252  	u32		avg_netoff;
253  	u16		max_netlen;
254  	u16		max_adjlen;
255  	u16		min_netlen;
256  	u16		min_adjlen;
257  };
258  
259  enum {
260  	CAKE_FLAG_OVERHEAD	   = BIT(0),
261  	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262  	CAKE_FLAG_INGRESS	   = BIT(2),
263  	CAKE_FLAG_WASH		   = BIT(3),
264  	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265  };
266  
267  /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268   * obtain the best features of each.  Codel is excellent on flows which
269   * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270   * unresponsive flows.
271   */
272  
273  struct cobalt_skb_cb {
274  	ktime_t enqueue_time;
275  	u32     adjusted_len;
276  };
277  
278  static u64 us_to_ns(u64 us)

279  {
280  	return us * NSEC_PER_USEC;
281  }
282  
283  static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284  {
285  	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286  	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287  }
288  
289  static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290  {
291  	return get_cobalt_cb(skb)->enqueue_time;
292  }
293  
294  static void cobalt_set_enqueue_time(struct sk_buff *skb,
295  				    ktime_t now)
296  {
297  	get_cobalt_cb(skb)->enqueue_time = now;
298  }
299  
300  static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301  
302  /* Diffserv lookup tables */
303  
304  static const u8 precedence[] = {
305  	0, 0, 0, 0, 0, 0, 0, 0,
306  	1, 1, 1, 1, 1, 1, 1, 1,
307  	2, 2, 2, 2, 2, 2, 2, 2,
308  	3, 3, 3, 3, 3, 3, 3, 3,
309  	4, 4, 4, 4, 4, 4, 4, 4,
310  	5, 5, 5, 5, 5, 5, 5, 5,
311  	6, 6, 6, 6, 6, 6, 6, 6,
312  	7, 7, 7, 7, 7, 7, 7, 7,
313  };
314  
315  static const u8 diffserv8[] = {
316  	2, 0, 1, 2, 4, 2, 2, 2,
317  	1, 2, 1, 2, 1, 2, 1, 2,
318  	5, 2, 4, 2, 4, 2, 4, 2,
319  	3, 2, 3, 2, 3, 2, 3, 2,
320  	6, 2, 3, 2, 3, 2, 3, 2,
321  	6, 2, 2, 2, 6, 2, 6, 2,
322  	7, 2, 2, 2, 2, 2, 2, 2,
323  	7, 2, 2, 2, 2, 2, 2, 2,
324  };
325  
326  static const u8 diffserv4[] = {
327  	0, 1, 0, 0, 2, 0, 0, 0,
328  	1, 0, 0, 0, 0, 0, 0, 0,
329  	2, 0, 2, 0, 2, 0, 2, 0,
330  	2, 0, 2, 0, 2, 0, 2, 0,
331  	3, 0, 2, 0, 2, 0, 2, 0,
332  	3, 0, 0, 0, 3, 0, 3, 0,
333  	3, 0, 0, 0, 0, 0, 0, 0,
334  	3, 0, 0, 0, 0, 0, 0, 0,
335  };
336  
337  static const u8 diffserv3[] = {
338  	0, 1, 0, 0, 2, 0, 0, 0,
339  	1, 0, 0, 0, 0, 0, 0, 0,
340  	0, 0, 0, 0, 0, 0, 0, 0,
341  	0, 0, 0, 0, 0, 0, 0, 0,
342  	0, 0, 0, 0, 0, 0, 0, 0,
343  	0, 0, 0, 0, 2, 0, 2, 0,
344  	2, 0, 0, 0, 0, 0, 0, 0,
345  	2, 0, 0, 0, 0, 0, 0, 0,
346  };
347  
348  static const u8 besteffort[] = {
349  	0, 0, 0, 0, 0, 0, 0, 0,
350  	0, 0, 0, 0, 0, 0, 0, 0,
351  	0, 0, 0, 0, 0, 0, 0, 0,
352  	0, 0, 0, 0, 0, 0, 0, 0,
353  	0, 0, 0, 0, 0, 0, 0, 0,
354  	0, 0, 0, 0, 0, 0, 0, 0,
355  	0, 0, 0, 0, 0, 0, 0, 0,
356  	0, 0, 0, 0, 0, 0, 0, 0,
357  };
358  
359  /* tin priority order for stats dumping */
360  
361  static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362  static const u8 bulk_order[] = {1, 0, 2, 3};
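/* The lookup tables above are indexed by the 6-bit DSCP value (0..63) and
 * return a tin number for the given Diffserv scheme.  For example, in
 * diffserv3 DSCP 46 (EF) maps to tin 2 (latency-sensitive) and DSCP 8 (CS1)
 * maps to tin 1 (bulk), with everything else in tin 0 (best effort).
 * bulk_order simply lists the bulk tin ahead of best effort for stats
 * dumping, per the comment above.
 */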
363  
364  #define REC_INV_SQRT_CACHE (16)
365  static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366  
367  /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368   * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369   *
370   * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371   */
372  
373  static void cobalt_newton_step(struct cobalt_vars *vars)
374  {
375  	u32 invsqrt, invsqrt2;
376  	u64 val;
377  
378  	invsqrt = vars->rec_inv_sqrt;
379  	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380  	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381  
382  	val >>= 2; /* avoid overflow in following multiply */
383  	val = (val * invsqrt) >> (32 - 2 + 1);
384  
385  	vars->rec_inv_sqrt = val;
386  }
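/* Worked example of the Q0.32 arithmetic above: with count == 4 and
 * rec_inv_sqrt == 0x80000000 (0.5), invsqrt2 comes out as 0x40000000 (0.25),
 * val = (3 - 4 * 0.25) = 2.0 scaled by 2^32, and after the two shifts the
 * final multiply yields 0x80000000 again -- i.e. the iteration has already
 * converged on 1/sqrt(4) = 0.5, as expected.
 */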
387  
388  static void cobalt_invsqrt(struct cobalt_vars *vars)
389  {
390  	if (vars->count < REC_INV_SQRT_CACHE)
391  		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392  	else
393  		cobalt_newton_step(vars);
394  }
395  
396  /* There is a big difference in timing between the accurate values placed in
397   * the cache and the approximations given by a single Newton step for small
398   * count values, particularly when stepping from count 1 to 2 or vice versa.
399   * Above 16, a single Newton step gives sufficient accuracy in either
400   * direction, given the precision stored.
401   *
402   * The magnitude of the error when stepping up to count 2 is such as to give
403   * the value that *should* have been produced at count 4.
404   */
405  
406  static void cobalt_cache_init(void)
407  {
408  	struct cobalt_vars v;
409  
410  	memset(&v, 0, sizeof(v));
411  	v.rec_inv_sqrt = ~0U;
412  	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413  
414  	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415  		cobalt_newton_step(&v);
416  		cobalt_newton_step(&v);
417  		cobalt_newton_step(&v);
418  		cobalt_newton_step(&v);
419  
420  		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421  	}
422  }
423  
424  static void cobalt_vars_init(struct cobalt_vars *vars)
425  {
426  	memset(vars, 0, sizeof(*vars));
427  
428  	if (!cobalt_rec_inv_sqrt_cache[0]) {
429  		cobalt_cache_init();
430  		cobalt_rec_inv_sqrt_cache[0] = ~0;
431  	}
432  }
433  
434  /* CoDel control_law is t + interval/sqrt(count)
435   * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436   * both sqrt() and divide operation.
437   */
438  static ktime_t cobalt_control(ktime_t t,
439  			      u64 interval,
440  			      u32 rec_inv_sqrt)
441  {
442  	return ktime_add_ns(t, reciprocal_scale(interval,
443  						rec_inv_sqrt));
444  }
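/* reciprocal_scale(interval, rec_inv_sqrt) is (interval * rec_inv_sqrt) >> 32,
 * i.e. interval / sqrt(count) given the Q0.32 encoding of rec_inv_sqrt.
 * For example, with interval = 100 ms (100,000,000 ns) and count = 4
 * (rec_inv_sqrt ~= 0x80000000), the next signalling time lands roughly
 * 50 ms after t.
 */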
445  
446  /* Call this when a packet had to be dropped due to queue overflow.  Returns
447   * true if the BLUE state was quiescent before but active after this call.
448   */
449  static bool cobalt_queue_full(struct cobalt_vars *vars,
450  			      struct cobalt_params *p,
451  			      ktime_t now)
452  {
453  	bool up = false;
454  
455  	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456  		up = !vars->p_drop;
457  		vars->p_drop += p->p_inc;
458  		if (vars->p_drop < p->p_inc)
459  			vars->p_drop = ~0;
460  		vars->blue_timer = now;
461  	}
462  	vars->dropping = true;
463  	vars->drop_next = now;
464  	if (!vars->count)
465  		vars->count = 1;
466  
467  	return up;
468  }
469  
470  /* Call this when the queue was serviced but turned out to be empty.  Returns
471   * true if the BLUE state was active before but quiescent after this call.
472   */
473  static bool cobalt_queue_empty(struct cobalt_vars *vars,
474  			       struct cobalt_params *p,
475  			       ktime_t now)
476  {
477  	bool down = false;
478  
479  	if (vars->p_drop &&
480  	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481  		if (vars->p_drop < p->p_dec)
482  			vars->p_drop = 0;
483  		else
484  			vars->p_drop -= p->p_dec;
485  		vars->blue_timer = now;
486  		down = !vars->p_drop;
487  	}
488  	vars->dropping = false;
489  
490  	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491  		vars->count--;
492  		cobalt_invsqrt(vars);
493  		vars->drop_next = cobalt_control(vars->drop_next,
494  						 p->interval,
495  						 vars->rec_inv_sqrt);
496  	}
497  
498  	return down;
499  }
500  
501  /* Call this with a freshly dequeued packet for possible congestion marking.
502   * Returns true as an instruction to drop the packet, false for delivery.
503   */
504  static bool cobalt_should_drop(struct cobalt_vars *vars,
505  			       struct cobalt_params *p,
506  			       ktime_t now,
507  			       struct sk_buff *skb,
508  			       u32 bulk_flows)
509  {
510  	bool next_due, over_target, drop = false;
511  	ktime_t schedule;
512  	u64 sojourn;
513  
514  /* The 'schedule' variable records, in its sign, whether 'now' is before or
515   * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516   * scheduling decision is actually branched, without destroying that
517   * information.  Similarly, the first 'schedule' value calculated is preserved
518   * in the boolean 'next_due'.
519   *
520   * As for 'drop_next', we take advantage of the fact that 'interval' is both
521   * the delay between first exceeding 'target' and the first signalling event,
522   * *and* the scaling factor for the signalling frequency.  It's therefore very
523   * natural to use a single mechanism for both purposes, and eliminates a
524   * significant amount of reference Codel's spaghetti code.  To help with this,
525   * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526   * as possible to 1.0 in fixed-point.
527   */
528  
529  	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530  	schedule = ktime_sub(now, vars->drop_next);
531  	over_target = sojourn > p->target &&
532  		      sojourn > p->mtu_time * bulk_flows * 2 &&
533  		      sojourn > p->mtu_time * 4;
534  	next_due = vars->count && ktime_to_ns(schedule) >= 0;
535  
536  	vars->ecn_marked = false;
537  
538  	if (over_target) {
539  		if (!vars->dropping) {
540  			vars->dropping = true;
541  			vars->drop_next = cobalt_control(now,
542  							 p->interval,
543  							 vars->rec_inv_sqrt);
544  		}
545  		if (!vars->count)
546  			vars->count = 1;
547  	} else if (vars->dropping) {
548  		vars->dropping = false;
549  	}
550  
551  	if (next_due && vars->dropping) {
552  		/* Use ECN mark if possible, otherwise drop */
553  		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554  
555  		vars->count++;
556  		if (!vars->count)
557  			vars->count--;
558  		cobalt_invsqrt(vars);
559  		vars->drop_next = cobalt_control(vars->drop_next,
560  						 p->interval,
561  						 vars->rec_inv_sqrt);
562  		schedule = ktime_sub(now, vars->drop_next);
563  	} else {
564  		while (next_due) {
565  			vars->count--;
566  			cobalt_invsqrt(vars);
567  			vars->drop_next = cobalt_control(vars->drop_next,
568  							 p->interval,
569  							 vars->rec_inv_sqrt);
570  			schedule = ktime_sub(now, vars->drop_next);
571  			next_due = vars->count && ktime_to_ns(schedule) >= 0;
572  		}
573  	}
574  
575  	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
576  	if (vars->p_drop)
577  		drop |= (get_random_u32() < vars->p_drop);
578  
579  	/* Overload the drop_next field as an activity timeout */
580  	if (!vars->count)
581  		vars->drop_next = ktime_add_ns(now, p->interval);
582  	else if (ktime_to_ns(schedule) > 0 && !drop)
583  		vars->drop_next = now;
584  
585  	return drop;
586  }
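/* Putting the above together for a persistently over-target queue, assuming
 * the usual target = 5 ms / interval = 100 ms tuning: the first packet whose
 * sojourn time exceeds the target arms the dropping state and schedules the
 * first signal one interval (100 ms) ahead; each subsequent signal arrives
 * interval/sqrt(count) later (100 ms, ~70.7 ms, ~57.7 ms, ...), which is the
 * CoDel control law.  BLUE's p_drop only becomes non-zero once the buffer
 * has actually overflowed, and then adds unconditional random drops on top.
 */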
587  
588  static bool cake_update_flowkeys(struct flow_keys *keys,
589  				 const struct sk_buff *skb)
590  {
591  #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592  	struct nf_conntrack_tuple tuple = {};
593  	bool rev = !skb->_nfct, upd = false;
594  	__be32 ip;
595  
596  	if (skb_protocol(skb, true) != htons(ETH_P_IP))
597  		return false;
598  
599  	if (!nf_ct_get_tuple_skb(&tuple, skb))
600  		return false;
601  
602  	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603  	if (ip != keys->addrs.v4addrs.src) {
604  		keys->addrs.v4addrs.src = ip;
605  		upd = true;
606  	}
607  	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608  	if (ip != keys->addrs.v4addrs.dst) {
609  		keys->addrs.v4addrs.dst = ip;
610  		upd = true;
611  	}
612  
613  	if (keys->ports.ports) {
614  		__be16 port;
615  
616  		port = rev ? tuple.dst.u.all : tuple.src.u.all;
617  		if (port != keys->ports.src) {
618  			keys->ports.src = port;
619  			upd = true;
620  		}
621  		port = rev ? tuple.src.u.all : tuple.dst.u.all;
622  		if (port != keys->ports.dst) {
623  			port = keys->ports.dst;
624  			upd = true;
625  		}
626  	}
627  	return upd;
628  #else
629  	return false;
630  #endif
631  }
632  
633  /* Cake has several subtle multiple-bit flow mode settings. In these cases you
634   * would be matching triple isolate mode as well.
635   */
636  
637  static bool cake_dsrc(int flow_mode)
638  {
639  	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640  }
641  
642  static bool cake_ddst(int flow_mode)
643  {
644  	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645  }
646  
647  static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
648  					     struct cake_flow *flow,
649  					     int flow_mode)
650  {
651  	if (likely(cake_dsrc(flow_mode) &&
652  		   q->hosts[flow->srchost].srchost_bulk_flow_count))
653  		q->hosts[flow->srchost].srchost_bulk_flow_count--;
654  }
655  
656  static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
657  					     struct cake_flow *flow,
658  					     int flow_mode)
659  {
660  	if (likely(cake_dsrc(flow_mode) &&
661  		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
662  		q->hosts[flow->srchost].srchost_bulk_flow_count++;
663  }
664  
665  static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
666  					     struct cake_flow *flow,
667  					     int flow_mode)
668  {
669  	if (likely(cake_ddst(flow_mode) &&
670  		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
671  		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
672  }
673  
674  static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
675  					     struct cake_flow *flow,
676  					     int flow_mode)
677  {
678  	if (likely(cake_ddst(flow_mode) &&
679  		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
680  		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
681  }
682  
683  static u16 cake_get_flow_quantum(struct cake_tin_data *q,
684  				 struct cake_flow *flow,
685  				 int flow_mode)
686  {
687  	u16 host_load = 1;
688  
689  	if (cake_dsrc(flow_mode))
690  		host_load = max(host_load,
691  				q->hosts[flow->srchost].srchost_bulk_flow_count);
692  
693  	if (cake_ddst(flow_mode))
694  		host_load = max(host_load,
695  				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
696  
697  	/* The get_random_u16() is a way to apply dithering to avoid
698  	 * accumulating roundoff errors
699  	 */
700  	return (q->flow_quantum * quantum_div[host_load] +
701  		get_random_u16()) >> 16;
702  }
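/* quantum_div[] is populated at init time (elsewhere in this file) with
 * roughly 65535 / host_load, so the expression above scales the per-flow
 * quantum down by the bulk-flow count of the busiest associated host.
 * Example: flow_quantum = 1514 and host_load = 3 gives
 * (1514 * 21845 + dither) >> 16 ~= 504 bytes, so the host's three flows
 * together still receive about one full quantum per round.
 */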
703  
704  static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
705  		     int flow_mode, u16 flow_override, u16 host_override)
706  {
707  	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
708  	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
709  	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
710  	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
711  	u16 reduced_hash, srchost_idx, dsthost_idx;
712  	struct flow_keys keys, host_keys;
713  	bool use_skbhash = skb->l4_hash;
714  
715  	if (unlikely(flow_mode == CAKE_FLOW_NONE))
716  		return 0;
717  
718  	/* If both overrides are set, or we can use the SKB hash and nat mode is
719  	 * disabled, we can skip packet dissection entirely. If nat mode is
720  	 * enabled there's another check below after doing the conntrack lookup.
721  	 */
722  	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
723  		goto skip_hash;
724  
725  	skb_flow_dissect_flow_keys(skb, &keys,
726  				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
727  
728  	/* Don't use the SKB hash if we change the lookup keys from conntrack */
729  	if (nat_enabled && cake_update_flowkeys(&keys, skb))
730  		use_skbhash = false;
731  
732  	/* If we can still use the SKB hash and don't need the host hash, we can
733  	 * skip the rest of the hashing procedure
734  	 */
735  	if (use_skbhash && !hash_hosts)
736  		goto skip_hash;
737  
738  	/* flow_hash_from_keys() sorts the addresses by value, so we have
739  	 * to preserve their order in a separate data structure to treat
740  	 * src and dst host addresses as independently selectable.
741  	 */
742  	host_keys = keys;
743  	host_keys.ports.ports     = 0;
744  	host_keys.basic.ip_proto  = 0;
745  	host_keys.keyid.keyid     = 0;
746  	host_keys.tags.flow_label = 0;
747  
748  	switch (host_keys.control.addr_type) {
749  	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
750  		host_keys.addrs.v4addrs.src = 0;
751  		dsthost_hash = flow_hash_from_keys(&host_keys);
752  		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
753  		host_keys.addrs.v4addrs.dst = 0;
754  		srchost_hash = flow_hash_from_keys(&host_keys);
755  		break;
756  
757  	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
758  		memset(&host_keys.addrs.v6addrs.src, 0,
759  		       sizeof(host_keys.addrs.v6addrs.src));
760  		dsthost_hash = flow_hash_from_keys(&host_keys);
761  		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
762  		memset(&host_keys.addrs.v6addrs.dst, 0,
763  		       sizeof(host_keys.addrs.v6addrs.dst));
764  		srchost_hash = flow_hash_from_keys(&host_keys);
765  		break;
766  
767  	default:
768  		dsthost_hash = 0;
769  		srchost_hash = 0;
770  	}
771  
772  	/* This *must* be after the above switch, since as a
773  	 * side-effect it sorts the src and dst addresses.
774  	 */
775  	if (hash_flows && !use_skbhash)
776  		flow_hash = flow_hash_from_keys(&keys);
777  
778  skip_hash:
779  	if (flow_override)
780  		flow_hash = flow_override - 1;
781  	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
782  		flow_hash = skb->hash;
783  	if (host_override) {
784  		dsthost_hash = host_override - 1;
785  		srchost_hash = host_override - 1;
786  	}
787  
788  	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
789  		if (flow_mode & CAKE_FLOW_SRC_IP)
790  			flow_hash ^= srchost_hash;
791  
792  		if (flow_mode & CAKE_FLOW_DST_IP)
793  			flow_hash ^= dsthost_hash;
794  	}
795  
796  	reduced_hash = flow_hash % CAKE_QUEUES;
797  
798  	/* set-associative hashing */
799  	/* fast path if no hash collision (direct lookup succeeds) */
800  	if (likely(q->tags[reduced_hash] == flow_hash &&
801  		   q->flows[reduced_hash].set)) {
802  		q->way_directs++;
803  	} else {
804  		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
805  		u32 outer_hash = reduced_hash - inner_hash;
806  		bool allocate_src = false;
807  		bool allocate_dst = false;
808  		u32 i, k;
809  
810  		/* check if any active queue in the set is reserved for
811  		 * this flow.
812  		 */
813  		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
814  		     i++, k = (k + 1) % CAKE_SET_WAYS) {
815  			if (q->tags[outer_hash + k] == flow_hash) {
816  				if (i)
817  					q->way_hits++;
818  
819  				if (!q->flows[outer_hash + k].set) {
820  					/* need to increment host refcnts */
821  					allocate_src = cake_dsrc(flow_mode);
822  					allocate_dst = cake_ddst(flow_mode);
823  				}
824  
825  				goto found;
826  			}
827  		}
828  
829  		/* no queue is reserved for this flow, look for an
830  		 * empty one.
831  		 */
832  		for (i = 0; i < CAKE_SET_WAYS;
833  			 i++, k = (k + 1) % CAKE_SET_WAYS) {
834  			if (!q->flows[outer_hash + k].set) {
835  				q->way_misses++;
836  				allocate_src = cake_dsrc(flow_mode);
837  				allocate_dst = cake_ddst(flow_mode);
838  				goto found;
839  			}
840  		}
841  
842  		/* With no empty queues, default to the original
843  		 * queue, accept the collision, update the host tags.
844  		 */
845  		q->way_collisions++;
846  		allocate_src = cake_dsrc(flow_mode);
847  		allocate_dst = cake_ddst(flow_mode);
848  
849  		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
850  			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
851  			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
852  		}
853  found:
854  		/* reserve queue for future packets in same flow */
855  		reduced_hash = outer_hash + k;
856  		q->tags[reduced_hash] = flow_hash;
857  
858  		if (allocate_src) {
859  			srchost_idx = srchost_hash % CAKE_QUEUES;
860  			inner_hash = srchost_idx % CAKE_SET_WAYS;
861  			outer_hash = srchost_idx - inner_hash;
862  			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
863  				i++, k = (k + 1) % CAKE_SET_WAYS) {
864  				if (q->hosts[outer_hash + k].srchost_tag ==
865  				    srchost_hash)
866  					goto found_src;
867  			}
868  			for (i = 0; i < CAKE_SET_WAYS;
869  				i++, k = (k + 1) % CAKE_SET_WAYS) {
870  				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
871  					break;
872  			}
873  			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
874  found_src:
875  			srchost_idx = outer_hash + k;
876  			q->flows[reduced_hash].srchost = srchost_idx;
877  
878  			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
879  				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
880  		}
881  
882  		if (allocate_dst) {
883  			dsthost_idx = dsthost_hash % CAKE_QUEUES;
884  			inner_hash = dsthost_idx % CAKE_SET_WAYS;
885  			outer_hash = dsthost_idx - inner_hash;
886  			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
887  			     i++, k = (k + 1) % CAKE_SET_WAYS) {
888  				if (q->hosts[outer_hash + k].dsthost_tag ==
889  				    dsthost_hash)
890  					goto found_dst;
891  			}
892  			for (i = 0; i < CAKE_SET_WAYS;
893  			     i++, k = (k + 1) % CAKE_SET_WAYS) {
894  				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
895  					break;
896  			}
897  			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
898  found_dst:
899  			dsthost_idx = outer_hash + k;
900  			q->flows[reduced_hash].dsthost = dsthost_idx;
901  
902  			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
903  				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
904  		}
905  	}
906  
907  	return reduced_hash;
908  }
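/* Sketch of the set-associative layout used above: the 1024 queues form 128
 * sets of CAKE_SET_WAYS (8) ways.  A reduced_hash of, say, 837 gives
 * inner_hash = 5 and outer_hash = 832, so ways 832..839 are probed (starting
 * at 837 and wrapping within the set) first for a matching tag, then for an
 * empty way, before falling back to accepting the collision.
 */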
909  
910  /* helper functions : might be changed when/if skb use a standard list_head */
911  /* remove one skb from head of slot queue */
912  
913  static struct sk_buff *dequeue_head(struct cake_flow *flow)
914  {
915  	struct sk_buff *skb = flow->head;
916  
917  	if (skb) {
918  		flow->head = skb->next;
919  		skb_mark_not_on_list(skb);
920  	}
921  
922  	return skb;
923  }
924  
925  /* add skb to flow queue (tail add) */
926  
927  static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
928  {
929  	if (!flow->head)
930  		flow->head = skb;
931  	else
932  		flow->tail->next = skb;
933  	flow->tail = skb;
934  	skb->next = NULL;
935  }
936  
937  static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
938  				    struct ipv6hdr *buf)
939  {
940  	unsigned int offset = skb_network_offset(skb);
941  	struct iphdr *iph;
942  
943  	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
944  
945  	if (!iph)
946  		return NULL;
947  
948  	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
949  		return skb_header_pointer(skb, offset + iph->ihl * 4,
950  					  sizeof(struct ipv6hdr), buf);
951  
952  	else if (iph->version == 4)
953  		return iph;
954  
955  	else if (iph->version == 6)
956  		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
957  					  buf);
958  
959  	return NULL;
960  }
961  
962  static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
963  				      void *buf, unsigned int bufsize)
964  {
965  	unsigned int offset = skb_network_offset(skb);
966  	const struct ipv6hdr *ipv6h;
967  	const struct tcphdr *tcph;
968  	const struct iphdr *iph;
969  	struct ipv6hdr _ipv6h;
970  	struct tcphdr _tcph;
971  
972  	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
973  
974  	if (!ipv6h)
975  		return NULL;
976  
977  	if (ipv6h->version == 4) {
978  		iph = (struct iphdr *)ipv6h;
979  		offset += iph->ihl * 4;
980  
981  		/* special-case 6in4 tunnelling, as that is a common way to get
982  		 * v6 connectivity in the home
983  		 */
984  		if (iph->protocol == IPPROTO_IPV6) {
985  			ipv6h = skb_header_pointer(skb, offset,
986  						   sizeof(_ipv6h), &_ipv6h);
987  
988  			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
989  				return NULL;
990  
991  			offset += sizeof(struct ipv6hdr);
992  
993  		} else if (iph->protocol != IPPROTO_TCP) {
994  			return NULL;
995  		}
996  
997  	} else if (ipv6h->version == 6) {
998  		if (ipv6h->nexthdr != IPPROTO_TCP)
999  			return NULL;
1000  
1001  		offset += sizeof(struct ipv6hdr);
1002  	} else {
1003  		return NULL;
1004  	}
1005  
1006  	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
1007  	if (!tcph || tcph->doff < 5)
1008  		return NULL;
1009  
1010  	return skb_header_pointer(skb, offset,
1011  				  min(__tcp_hdrlen(tcph), bufsize), buf);
1012  }
1013  
1014  static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1015  				   int code, int *oplen)
1016  {
1017  	/* inspired by tcp_parse_options in tcp_input.c */
1018  	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1019  	const u8 *ptr = (const u8 *)(tcph + 1);
1020  
1021  	while (length > 0) {
1022  		int opcode = *ptr++;
1023  		int opsize;
1024  
1025  		if (opcode == TCPOPT_EOL)
1026  			break;
1027  		if (opcode == TCPOPT_NOP) {
1028  			length--;
1029  			continue;
1030  		}
1031  		if (length < 2)
1032  			break;
1033  		opsize = *ptr++;
1034  		if (opsize < 2 || opsize > length)
1035  			break;
1036  
1037  		if (opcode == code) {
1038  			*oplen = opsize;
1039  			return ptr;
1040  		}
1041  
1042  		ptr += opsize - 2;
1043  		length -= opsize;
1044  	}
1045  
1046  	return NULL;
1047  }
1048  
1049  /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1050   * bytes than the other. In the case where both sequences ACK bytes that the
1051   * other doesn't, A is considered greater. DSACKs in A also makes A be
1052   * considered greater.
1053   *
1054   * @return -1, 0 or 1 as normal compare functions
1055   */
1056  static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1057  				  const struct tcphdr *tcph_b)
1058  {
1059  	const struct tcp_sack_block_wire *sack_a, *sack_b;
1060  	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1061  	u32 bytes_a = 0, bytes_b = 0;
1062  	int oplen_a, oplen_b;
1063  	bool first = true;
1064  
1065  	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1066  	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1067  
1068  	/* pointers point to option contents */
1069  	oplen_a -= TCPOLEN_SACK_BASE;
1070  	oplen_b -= TCPOLEN_SACK_BASE;
1071  
1072  	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1073  	    (!sack_b || oplen_b < sizeof(*sack_b)))
1074  		return -1;
1075  	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1076  		 (!sack_a || oplen_a < sizeof(*sack_a)))
1077  		return 1;
1078  	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1079  		 (!sack_b || oplen_b < sizeof(*sack_b)))
1080  		return 0;
1081  
1082  	while (oplen_a >= sizeof(*sack_a)) {
1083  		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1084  		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1085  		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1086  		int oplen_tmp = oplen_b;
1087  		bool found = false;
1088  
1089  		/* DSACK; always considered greater to prevent dropping */
1090  		if (before(start_a, ack_seq_a))
1091  			return -1;
1092  
1093  		bytes_a += end_a - start_a;
1094  
1095  		while (oplen_tmp >= sizeof(*sack_tmp)) {
1096  			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1097  			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1098  
1099  			/* first time through we count the total size */
1100  			if (first)
1101  				bytes_b += end_b - start_b;
1102  
1103  			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1104  				found = true;
1105  				if (!first)
1106  					break;
1107  			}
1108  			oplen_tmp -= sizeof(*sack_tmp);
1109  			sack_tmp++;
1110  		}
1111  
1112  		if (!found)
1113  			return -1;
1114  
1115  		oplen_a -= sizeof(*sack_a);
1116  		sack_a++;
1117  		first = false;
1118  	}
1119  
1120  	/* If we made it this far, all ranges SACKed by A are covered by B, so
1121  	 * either the SACKs are equal, or B SACKs more bytes.
1122  	 */
1123  	return bytes_b > bytes_a ? 1 : 0;
1124  }
1125  
1126  static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1127  				 u32 *tsval, u32 *tsecr)
1128  {
1129  	const u8 *ptr;
1130  	int opsize;
1131  
1132  	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1133  
1134  	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1135  		*tsval = get_unaligned_be32(ptr);
1136  		*tsecr = get_unaligned_be32(ptr + 4);
1137  	}
1138  }
1139  
1140  static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1141  			       u32 tstamp_new, u32 tsecr_new)
1142  {
1143  	/* inspired by tcp_parse_options in tcp_input.c */
1144  	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1145  	const u8 *ptr = (const u8 *)(tcph + 1);
1146  	u32 tstamp, tsecr;
1147  
1148  	/* 3 reserved flags must be unset to avoid future breakage
1149  	 * ACK must be set
1150  	 * ECE/CWR are handled separately
1151  	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1152  	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1153  	 * 0x00C00000 = CWR/ECE (handled separately)
1154  	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1155  	 */
1156  	if (((tcp_flag_word(tcph) &
1157  	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1158  		return false;
1159  
1160  	while (length > 0) {
1161  		int opcode = *ptr++;
1162  		int opsize;
1163  
1164  		if (opcode == TCPOPT_EOL)
1165  			break;
1166  		if (opcode == TCPOPT_NOP) {
1167  			length--;
1168  			continue;
1169  		}
1170  		if (length < 2)
1171  			break;
1172  		opsize = *ptr++;
1173  		if (opsize < 2 || opsize > length)
1174  			break;
1175  
1176  		switch (opcode) {
1177  		case TCPOPT_MD5SIG: /* doesn't influence state */
1178  			break;
1179  
1180  		case TCPOPT_SACK: /* stricter checking performed later */
1181  			if (opsize % 8 != 2)
1182  				return false;
1183  			break;
1184  
1185  		case TCPOPT_TIMESTAMP:
1186  			/* only drop timestamps lower than new */
1187  			if (opsize != TCPOLEN_TIMESTAMP)
1188  				return false;
1189  			tstamp = get_unaligned_be32(ptr);
1190  			tsecr = get_unaligned_be32(ptr + 4);
1191  			if (after(tstamp, tstamp_new) ||
1192  			    after(tsecr, tsecr_new))
1193  				return false;
1194  			break;
1195  
1196  		case TCPOPT_MSS:  /* these should only be set on SYN */
1197  		case TCPOPT_WINDOW:
1198  		case TCPOPT_SACK_PERM:
1199  		case TCPOPT_FASTOPEN:
1200  		case TCPOPT_EXP:
1201  		default: /* don't drop if any unknown options are present */
1202  			return false;
1203  		}
1204  
1205  		ptr += opsize - 2;
1206  		length -= opsize;
1207  	}
1208  
1209  	return true;
1210  }
1211  
1212  static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1213  				       struct cake_flow *flow)
1214  {
1215  	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1216  	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1217  	struct sk_buff *skb_check, *skb_prev = NULL;
1218  	const struct ipv6hdr *ipv6h, *ipv6h_check;
1219  	unsigned char _tcph[64], _tcph_check[64];
1220  	const struct tcphdr *tcph, *tcph_check;
1221  	const struct iphdr *iph, *iph_check;
1222  	struct ipv6hdr _iph, _iph_check;
1223  	const struct sk_buff *skb;
1224  	int seglen, num_found = 0;
1225  	u32 tstamp = 0, tsecr = 0;
1226  	__be32 elig_flags = 0;
1227  	int sack_comp;
1228  
1229  	/* no other possible ACKs to filter */
1230  	if (flow->head == flow->tail)
1231  		return NULL;
1232  
1233  	skb = flow->tail;
1234  	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1235  	iph = cake_get_iphdr(skb, &_iph);
1236  	if (!tcph)
1237  		return NULL;
1238  
1239  	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1240  
1241  	/* the 'triggering' packet need only have the ACK flag set.
1242  	 * also check that SYN is not set, as there won't be any previous ACKs.
1243  	 */
1244  	if ((tcp_flag_word(tcph) &
1245  	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1246  		return NULL;
1247  
1248  	/* the 'triggering' ACK is at the tail of the queue, we have already
1249  	 * returned if it is the only packet in the flow. loop through the rest
1250  	 * of the queue looking for pure ACKs with the same 5-tuple as the
1251  	 * triggering one.
1252  	 */
1253  	for (skb_check = flow->head;
1254  	     skb_check && skb_check != skb;
1255  	     skb_prev = skb_check, skb_check = skb_check->next) {
1256  		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1257  		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1258  					     sizeof(_tcph_check));
1259  
1260  		/* only TCP packets with matching 5-tuple are eligible, and only
1261  		 * drop safe headers
1262  		 */
1263  		if (!tcph_check || iph->version != iph_check->version ||
1264  		    tcph_check->source != tcph->source ||
1265  		    tcph_check->dest != tcph->dest)
1266  			continue;
1267  
1268  		if (iph_check->version == 4) {
1269  			if (iph_check->saddr != iph->saddr ||
1270  			    iph_check->daddr != iph->daddr)
1271  				continue;
1272  
1273  			seglen = iph_totlen(skb, iph_check) -
1274  				       (4 * iph_check->ihl);
1275  		} else if (iph_check->version == 6) {
1276  			ipv6h = (struct ipv6hdr *)iph;
1277  			ipv6h_check = (struct ipv6hdr *)iph_check;
1278  
1279  			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1280  			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1281  				continue;
1282  
1283  			seglen = ntohs(ipv6h_check->payload_len);
1284  		} else {
1285  			WARN_ON(1);  /* shouldn't happen */
1286  			continue;
1287  		}
1288  
1289  		/* If the ECE/CWR flags changed from the previous eligible
1290  		 * packet in the same flow, we should no longer be dropping that
1291  		 * previous packet as this would lose information.
1292  		 */
1293  		if (elig_ack && (tcp_flag_word(tcph_check) &
1294  				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1295  			elig_ack = NULL;
1296  			elig_ack_prev = NULL;
1297  			num_found--;
1298  		}
1299  
1300  		/* Check TCP options and flags, don't drop ACKs with segment
1301  		 * data, and don't drop ACKs with a higher cumulative ACK
1302  		 * counter than the triggering packet. Check ACK seqno here to
1303  		 * avoid parsing SACK options of packets we are going to exclude
1304  		 * anyway.
1305  		 */
1306  		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1307  		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1308  		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1309  			continue;
1310  
1311  		/* Check SACK options. The triggering packet must SACK more data
1312  		 * than the ACK under consideration, or SACK the same range but
1313  		 * have a larger cumulative ACK counter. The latter is a
1314  		 * pathological case, but is contained in the following check
1315  		 * anyway, just to be safe.
1316  		 */
1317  		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1318  
1319  		if (sack_comp < 0 ||
1320  		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1321  		     sack_comp == 0))
1322  			continue;
1323  
1324  		/* At this point we have found an eligible pure ACK to drop; if
1325  		 * we are in aggressive mode, we are done. Otherwise, keep
1326  		 * searching unless this is the second eligible ACK we
1327  		 * found.
1328  		 *
1329  		 * Since we want to drop ACK closest to the head of the queue,
1330  		 * save the first eligible ACK we find, even if we need to loop
1331  		 * again.
1332  		 */
1333  		if (!elig_ack) {
1334  			elig_ack = skb_check;
1335  			elig_ack_prev = skb_prev;
1336  			elig_flags = (tcp_flag_word(tcph_check)
1337  				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1338  		}
1339  
1340  		if (num_found++ > 0)
1341  			goto found;
1342  	}
1343  
1344  	/* We made it through the queue without finding two eligible ACKs. If
1345  	 * we found a single eligible ACK we can drop it in aggressive mode if
1346  	 * we can guarantee that this does not interfere with ECN flag
1347  	 * information. We ensure this by dropping it only if the enqueued
1348  	 * packet is consecutive with the eligible ACK, and their flags match.
1349  	 */
1350  	if (elig_ack && aggressive && elig_ack->next == skb &&
1351  	    (elig_flags == (tcp_flag_word(tcph) &
1352  			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1353  		goto found;
1354  
1355  	return NULL;
1356  
1357  found:
1358  	if (elig_ack_prev)
1359  		elig_ack_prev->next = elig_ack->next;
1360  	else
1361  		flow->head = elig_ack->next;
1362  
1363  	skb_mark_not_on_list(elig_ack);
1364  
1365  	return elig_ack;
1366  }
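/* In other words: the filter only removes a pure ACK that is provably
 * redundant given the newly enqueued ACK at the tail -- same 5-tuple, no
 * payload, no unusual flags or options, an older cumulative ACK and SACK
 * coverage, and matching ECE/CWR bits.  Conservative mode additionally
 * requires at least two such ACKs in the queue, so one redundant ACK per
 * flow is always left in place.
 */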
1367  
1368  static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1369  {
1370  	avg -= avg >> shift;
1371  	avg += sample >> shift;
1372  	return avg;
1373  }
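/* Standard exponentially-weighted moving average: with shift = 8 each new
 * sample contributes 1/256 of its value and the old average decays by the
 * same fraction, e.g. avg = 1000, sample = 2000 -> 1000 - 3 + 7 = 1004
 * (integer arithmetic, so small samples round down).
 */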
1374  
1375  static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1376  {
1377  	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1378  		len -= off;
1379  
1380  	if (q->max_netlen < len)
1381  		q->max_netlen = len;
1382  	if (q->min_netlen > len)
1383  		q->min_netlen = len;
1384  
1385  	len += q->rate_overhead;
1386  
1387  	if (len < q->rate_mpu)
1388  		len = q->rate_mpu;
1389  
1390  	if (q->atm_mode == CAKE_ATM_ATM) {
1391  		len += 47;
1392  		len /= 48;
1393  		len *= 53;
1394  	} else if (q->atm_mode == CAKE_ATM_PTM) {
1395  		/* Add one byte per 64 bytes or part thereof.
1396  		 * This is conservative and easier to calculate than the
1397  		 * precise value.
1398  		 */
1399  		len += (len + 63) / 64;
1400  	}
1401  
1402  	if (q->max_adjlen < len)
1403  		q->max_adjlen = len;
1404  	if (q->min_adjlen > len)
1405  		q->min_adjlen = len;
1406  
1407  	return len;
1408  }
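/* Example of the link-layer adjustment above: a 120-byte packet (after any
 * configured overhead/MPU) on an ATM/AAL5 link occupies (120 + 47) / 48 = 3
 * cells of 53 bytes, so len becomes 159; on a PTM link the 64/65-octet
 * encoding instead adds (120 + 63) / 64 = 2 bytes, giving 122.
 */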
1409  
1410  static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1411  {
1412  	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1413  	unsigned int hdr_len, last_len = 0;
1414  	u32 off = skb_network_offset(skb);
1415  	u32 len = qdisc_pkt_len(skb);
1416  	u16 segs = 1;
1417  
1418  	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1419  
1420  	if (!shinfo->gso_size)
1421  		return cake_calc_overhead(q, len, off);
1422  
1423  	/* borrowed from qdisc_pkt_len_init() */
1424  	hdr_len = skb_transport_offset(skb);
1425  
1426  	/* + transport layer */
1427  	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1428  						SKB_GSO_TCPV6))) {
1429  		const struct tcphdr *th;
1430  		struct tcphdr _tcphdr;
1431  
1432  		th = skb_header_pointer(skb, hdr_len,
1433  					sizeof(_tcphdr), &_tcphdr);
1434  		if (likely(th))
1435  			hdr_len += __tcp_hdrlen(th);
1436  	} else {
1437  		struct udphdr _udphdr;
1438  
1439  		if (skb_header_pointer(skb, hdr_len,
1440  				       sizeof(_udphdr), &_udphdr))
1441  			hdr_len += sizeof(struct udphdr);
1442  	}
1443  
1444  	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1445  		segs = DIV_ROUND_UP(skb->len - hdr_len,
1446  				    shinfo->gso_size);
1447  	else
1448  		segs = shinfo->gso_segs;
1449  
1450  	len = shinfo->gso_size + hdr_len;
1451  	last_len = skb->len - shinfo->gso_size * (segs - 1);
1452  
1453  	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1454  		cake_calc_overhead(q, last_len, off));
1455  }
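/* For a GSO super-packet the overhead is charged once per segment.  Sketch
 * with assumed numbers: skb->len = 2948, hdr_len = 52, gso_size = 1448 and
 * gso_segs = 2 gives len = 1500 for the first segment and last_len = 1500
 * for the final one, so the result is simply twice the single-segment
 * adjusted length.
 */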
1456  
1457  static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1458  {
1459  	struct cake_heap_entry ii = q->overflow_heap[i];
1460  	struct cake_heap_entry jj = q->overflow_heap[j];
1461  
1462  	q->overflow_heap[i] = jj;
1463  	q->overflow_heap[j] = ii;
1464  
1465  	q->tins[ii.t].overflow_idx[ii.b] = j;
1466  	q->tins[jj.t].overflow_idx[jj.b] = i;
1467  }
1468  
1469  static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1470  {
1471  	struct cake_heap_entry ii = q->overflow_heap[i];
1472  
1473  	return q->tins[ii.t].backlogs[ii.b];
1474  }
1475  
1476  static void cake_heapify(struct cake_sched_data *q, u16 i)
1477  {
1478  	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1479  	u32 mb = cake_heap_get_backlog(q, i);
1480  	u32 m = i;
1481  
1482  	while (m < a) {
1483  		u32 l = m + m + 1;
1484  		u32 r = l + 1;
1485  
1486  		if (l < a) {
1487  			u32 lb = cake_heap_get_backlog(q, l);
1488  
1489  			if (lb > mb) {
1490  				m  = l;
1491  				mb = lb;
1492  			}
1493  		}
1494  
1495  		if (r < a) {
1496  			u32 rb = cake_heap_get_backlog(q, r);
1497  
1498  			if (rb > mb) {
1499  				m  = r;
1500  				mb = rb;
1501  			}
1502  		}
1503  
1504  		if (m != i) {
1505  			cake_heap_swap(q, i, m);
1506  			i = m;
1507  		} else {
1508  			break;
1509  		}
1510  	}
1511  }
1512  
1513  static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1514  {
1515  	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1516  		u16 p = (i - 1) >> 1;
1517  		u32 ib = cake_heap_get_backlog(q, i);
1518  		u32 pb = cake_heap_get_backlog(q, p);
1519  
1520  		if (ib > pb) {
1521  			cake_heap_swap(q, i, p);
1522  			i = p;
1523  		} else {
1524  			break;
1525  		}
1526  	}
1527  }
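/* The overflow heap is a classic array-backed binary max-heap keyed on
 * per-queue backlog: entry i has parent (i - 1) / 2 and children 2i + 1 and
 * 2i + 2, so overflow_heap[0] always refers to the fattest queue, which is
 * what cake_drop() below prunes when the buffer limit is exceeded.
 */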
1528  
1529  static int cake_advance_shaper(struct cake_sched_data *q,
1530  			       struct cake_tin_data *b,
1531  			       struct sk_buff *skb,
1532  			       ktime_t now, bool drop)
1533  {
1534  	u32 len = get_cobalt_cb(skb)->adjusted_len;
1535  
1536  	/* charge packet bandwidth to this tin
1537  	 * and to the global shaper.
1538  	 */
1539  	if (q->rate_ns) {
1540  		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1541  		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1542  		u64 failsafe_dur = global_dur + (global_dur >> 1);
1543  
1544  		if (ktime_before(b->time_next_packet, now))
1545  			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1546  							   tin_dur);
1547  
1548  		else if (ktime_before(b->time_next_packet,
1549  				      ktime_add_ns(now, tin_dur)))
1550  			b->time_next_packet = ktime_add_ns(now, tin_dur);
1551  
1552  		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1553  						   global_dur);
1554  		if (!drop)
1555  			q->failsafe_next_packet = \
1556  				ktime_add_ns(q->failsafe_next_packet,
1557  					     failsafe_dur);
1558  	}
1559  	return len;
1560  }
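/* rate_ns/rate_shft (and the per-tin equivalents) encode nanoseconds per
 * byte in fixed point.  Rough example: at 100 Mbit/s a byte takes 80 ns, so
 * a 1514-byte packet advances the tin and global shaper clocks by about
 * 121 us, and the failsafe clock by 1.5x that (~182 us) so a bad rate
 * estimate cannot stall dequeue indefinitely.
 */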
1561  
1562  static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1563  {
1564  	struct cake_sched_data *q = qdisc_priv(sch);
1565  	ktime_t now = ktime_get();
1566  	u32 idx = 0, tin = 0, len;
1567  	struct cake_heap_entry qq;
1568  	struct cake_tin_data *b;
1569  	struct cake_flow *flow;
1570  	struct sk_buff *skb;
1571  
1572  	if (!q->overflow_timeout) {
1573  		int i;
1574  		/* Build fresh max-heap */
1575  		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1576  			cake_heapify(q, i);
1577  	}
1578  	q->overflow_timeout = 65535;
1579  
1580  	/* select longest queue for pruning */
1581  	qq  = q->overflow_heap[0];
1582  	tin = qq.t;
1583  	idx = qq.b;
1584  
1585  	b = &q->tins[tin];
1586  	flow = &b->flows[idx];
1587  	skb = dequeue_head(flow);
1588  	if (unlikely(!skb)) {
1589  		/* heap has gone wrong, rebuild it next time */
1590  		q->overflow_timeout = 0;
1591  		return idx + (tin << 16);
1592  	}
1593  
1594  	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1595  		b->unresponsive_flow_count++;
1596  
1597  	len = qdisc_pkt_len(skb);
1598  	q->buffer_used      -= skb->truesize;
1599  	b->backlogs[idx]    -= len;
1600  	b->tin_backlog      -= len;
1601  	sch->qstats.backlog -= len;
1602  
1603  	flow->dropped++;
1604  	b->tin_dropped++;
1605  	sch->qstats.drops++;
1606  
1607  	if (q->rate_flags & CAKE_FLAG_INGRESS)
1608  		cake_advance_shaper(q, b, skb, now, true);
1609  
1610  	__qdisc_drop(skb, to_free);
1611  	sch->q.qlen--;
1612  	qdisc_tree_reduce_backlog(sch, 1, len);
1613  
1614  	cake_heapify(q, 0);
1615  
1616  	return idx + (tin << 16);
1617  }
1618  
1619  static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1620  {
1621  	const int offset = skb_network_offset(skb);
1622  	u16 *buf, buf_;
1623  	u8 dscp;
1624  
1625  	switch (skb_protocol(skb, true)) {
1626  	case htons(ETH_P_IP):
1627  		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1628  		if (unlikely(!buf))
1629  			return 0;
1630  
1631  		/* ToS is in the second byte of iphdr */
1632  		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1633  
1634  		if (wash && dscp) {
1635  			const int wlen = offset + sizeof(struct iphdr);
1636  
1637  			if (!pskb_may_pull(skb, wlen) ||
1638  			    skb_try_make_writable(skb, wlen))
1639  				return 0;
1640  
1641  			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1642  		}
1643  
1644  		return dscp;
1645  
1646  	case htons(ETH_P_IPV6):
1647  		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1648  		if (unlikely(!buf))
1649  			return 0;
1650  
1651  		/* Traffic class is in the first and second bytes of ipv6hdr */
1652  		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1653  
1654  		if (wash && dscp) {
1655  			const int wlen = offset + sizeof(struct ipv6hdr);
1656  
1657  			if (!pskb_may_pull(skb, wlen) ||
1658  			    skb_try_make_writable(skb, wlen))
1659  				return 0;
1660  
1661  			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1662  		}
1663  
1664  		return dscp;
1665  
1666  	case htons(ETH_P_ARP):
1667  		return 0x38;  /* CS7 - Net Control */
1668  
1669  	default:
1670  		/* If there is no Diffserv field, treat as best-effort */
1671  		return 0;
1672  	}
1673  }
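/* The DSCP is the top six bits of the IPv4 ToS / IPv6 Traffic Class octet,
 * hence the >> 2 above; e.g. a dsfield of 0xb8 yields DSCP 46 (EF).  When
 * washing, ipv{4,6}_change_dsfield() is called with INET_ECN_MASK so the
 * two ECN bits are preserved while the DSCP bits are cleared to zero.
 */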
1674  
1675  static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1676  					     struct sk_buff *skb)
1677  {
1678  	struct cake_sched_data *q = qdisc_priv(sch);
1679  	u32 tin, mark;
1680  	bool wash;
1681  	u8 dscp;
1682  
1683  	/* Tin selection: Default to diffserv-based selection, allow overriding
1684  	 * using firewall marks or skb->priority. Call DSCP parsing early if
1685  	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1686  	 */
1687  	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1688  	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1689  	if (wash)
1690  		dscp = cake_handle_diffserv(skb, wash);
1691  
1692  	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1693  		tin = 0;
1694  
1695  	else if (mark && mark <= q->tin_cnt)
1696  		tin = q->tin_order[mark - 1];
1697  
1698  	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1699  		 TC_H_MIN(skb->priority) > 0 &&
1700  		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1701  		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1702  
1703  	else {
1704  		if (!wash)
1705  			dscp = cake_handle_diffserv(skb, wash);
1706  		tin = q->tin_index[dscp];
1707  
1708  		if (unlikely(tin >= q->tin_cnt))
1709  			tin = 0;
1710  	}
1711  
1712  	return &q->tins[tin];
1713  }
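/* Editorial note (not part of the original source): a small worked example of
 * the firewall-mark override above, assuming fwmark_mask was configured as
 * 0x00000300 via TCA_CAKE_FWMARK:
 *
 *	q->fwmark_shft = __ffs(0x300) = 8
 *	skb->mark      = 0x00000200
 *	mark = (0x200 & 0x300) >> 8 = 2   ->   tin = q->tin_order[1]
 *
 * If the extracted value is zero or exceeds q->tin_cnt, selection falls
 * through to the skb->priority and DSCP checks (or to tin 0 when the qdisc
 * is in besteffort mode, which is tested first).
 */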
1714  
1715  static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1716  			 struct sk_buff *skb, int flow_mode, int *qerr)
1717  {
1718  	struct cake_sched_data *q = qdisc_priv(sch);
1719  	struct tcf_proto *filter;
1720  	struct tcf_result res;
1721  	u16 flow = 0, host = 0;
1722  	int result;
1723  
1724  	filter = rcu_dereference_bh(q->filter_list);
1725  	if (!filter)
1726  		goto hash;
1727  
1728  	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1729  	result = tcf_classify(skb, NULL, filter, &res, false);
1730  
1731  	if (result >= 0) {
1732  #ifdef CONFIG_NET_CLS_ACT
1733  		switch (result) {
1734  		case TC_ACT_STOLEN:
1735  		case TC_ACT_QUEUED:
1736  		case TC_ACT_TRAP:
1737  			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1738  			fallthrough;
1739  		case TC_ACT_SHOT:
1740  			return 0;
1741  		}
1742  #endif
1743  		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1744  			flow = TC_H_MIN(res.classid);
1745  		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1746  			host = TC_H_MAJ(res.classid) >> 16;
1747  	}
1748  hash:
1749  	*t = cake_select_tin(sch, skb);
1750  	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1751  }
1752  
1753  static void cake_reconfigure(struct Qdisc *sch);
1754  
1755  static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1756  			struct sk_buff **to_free)
1757  {
1758  	struct cake_sched_data *q = qdisc_priv(sch);
1759  	int len = qdisc_pkt_len(skb);
1760  	int ret;
1761  	struct sk_buff *ack = NULL;
1762  	ktime_t now = ktime_get();
1763  	struct cake_tin_data *b;
1764  	struct cake_flow *flow;
1765  	u32 idx;
1766  
1767  	/* choose flow to insert into */
1768  	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1769  	if (idx == 0) {
1770  		if (ret & __NET_XMIT_BYPASS)
1771  			qdisc_qstats_drop(sch);
1772  		__qdisc_drop(skb, to_free);
1773  		return ret;
1774  	}
1775  	idx--;
1776  	flow = &b->flows[idx];
1777  
1778  	/* ensure shaper state isn't stale */
1779  	if (!b->tin_backlog) {
1780  		if (ktime_before(b->time_next_packet, now))
1781  			b->time_next_packet = now;
1782  
1783  		if (!sch->q.qlen) {
1784  			if (ktime_before(q->time_next_packet, now)) {
1785  				q->failsafe_next_packet = now;
1786  				q->time_next_packet = now;
1787  			} else if (ktime_after(q->time_next_packet, now) &&
1788  				   ktime_after(q->failsafe_next_packet, now)) {
1789  				u64 next = \
1790  					min(ktime_to_ns(q->time_next_packet),
1791  					    ktime_to_ns(
1792  						   q->failsafe_next_packet));
1793  				sch->qstats.overlimits++;
1794  				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1795  			}
1796  		}
1797  	}
1798  
1799  	if (unlikely(len > b->max_skblen))
1800  		b->max_skblen = len;
1801  
1802  	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1803  		struct sk_buff *segs, *nskb;
1804  		netdev_features_t features = netif_skb_features(skb);
1805  		unsigned int slen = 0, numsegs = 0;
1806  
1807  		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1808  		if (IS_ERR_OR_NULL(segs))
1809  			return qdisc_drop(skb, sch, to_free);
1810  
1811  		skb_list_walk_safe(segs, segs, nskb) {
1812  			skb_mark_not_on_list(segs);
1813  			qdisc_skb_cb(segs)->pkt_len = segs->len;
1814  			cobalt_set_enqueue_time(segs, now);
1815  			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1816  									  segs);
1817  			flow_queue_add(flow, segs);
1818  
1819  			sch->q.qlen++;
1820  			numsegs++;
1821  			slen += segs->len;
1822  			q->buffer_used += segs->truesize;
1823  			b->packets++;
1824  		}
1825  
1826  		/* stats */
1827  		b->bytes	    += slen;
1828  		b->backlogs[idx]    += slen;
1829  		b->tin_backlog      += slen;
1830  		sch->qstats.backlog += slen;
1831  		q->avg_window_bytes += slen;
1832  
1833  		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1834  		consume_skb(skb);
1835  	} else {
1836  		/* not splitting */
1837  		cobalt_set_enqueue_time(skb, now);
1838  		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1839  		flow_queue_add(flow, skb);
1840  
1841  		if (q->ack_filter)
1842  			ack = cake_ack_filter(q, flow);
1843  
1844  		if (ack) {
1845  			b->ack_drops++;
1846  			sch->qstats.drops++;
1847  			b->bytes += qdisc_pkt_len(ack);
1848  			len -= qdisc_pkt_len(ack);
1849  			q->buffer_used += skb->truesize - ack->truesize;
1850  			if (q->rate_flags & CAKE_FLAG_INGRESS)
1851  				cake_advance_shaper(q, b, ack, now, true);
1852  
1853  			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1854  			consume_skb(ack);
1855  		} else {
1856  			sch->q.qlen++;
1857  			q->buffer_used      += skb->truesize;
1858  		}
1859  
1860  		/* stats */
1861  		b->packets++;
1862  		b->bytes	    += len;
1863  		b->backlogs[idx]    += len;
1864  		b->tin_backlog      += len;
1865  		sch->qstats.backlog += len;
1866  		q->avg_window_bytes += len;
1867  	}
1868  
1869  	if (q->overflow_timeout)
1870  		cake_heapify_up(q, b->overflow_idx[idx]);
1871  
1872  	/* incoming bandwidth capacity estimate */
1873  	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1874  		u64 packet_interval = \
1875  			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1876  
1877  		if (packet_interval > NSEC_PER_SEC)
1878  			packet_interval = NSEC_PER_SEC;
1879  
1880  		/* filter out short-term bursts, eg. wifi aggregation */
1881  		q->avg_packet_interval = \
1882  			cake_ewma(q->avg_packet_interval,
1883  				  packet_interval,
1884  				  (packet_interval > q->avg_packet_interval ?
1885  					  2 : 8));
1886  
1887  		q->last_packet_time = now;
1888  
1889  		if (packet_interval > q->avg_packet_interval) {
1890  			u64 window_interval = \
1891  				ktime_to_ns(ktime_sub(now,
1892  						      q->avg_window_begin));
1893  			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1894  
1895  			b = div64_u64(b, window_interval);
1896  			q->avg_peak_bandwidth =
1897  				cake_ewma(q->avg_peak_bandwidth, b,
1898  					  b > q->avg_peak_bandwidth ? 2 : 8);
1899  			q->avg_window_bytes = 0;
1900  			q->avg_window_begin = now;
1901  
1902  			if (ktime_after(now,
1903  					ktime_add_ms(q->last_reconfig_time,
1904  						     250))) {
1905  				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1906  				cake_reconfigure(sch);
1907  			}
1908  		}
1909  	} else {
1910  		q->avg_window_bytes = 0;
1911  		q->last_packet_time = now;
1912  	}
1913  
1914  	/* flowchain */
1915  	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1916  		if (!flow->set) {
1917  			list_add_tail(&flow->flowchain, &b->new_flows);
1918  		} else {
1919  			b->decaying_flow_count--;
1920  			list_move_tail(&flow->flowchain, &b->new_flows);
1921  		}
1922  		flow->set = CAKE_SET_SPARSE;
1923  		b->sparse_flow_count++;
1924  
1925  		flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1926  	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1927  		/* this flow was empty, accounted as a sparse flow, but actually
1928  		 * in the bulk rotation.
1929  		 */
1930  		flow->set = CAKE_SET_BULK;
1931  		b->sparse_flow_count--;
1932  		b->bulk_flow_count++;
1933  
1934  		cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1935  		cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1936  	}
1937  
1938  	if (q->buffer_used > q->buffer_max_used)
1939  		q->buffer_max_used = q->buffer_used;
1940  
1941  	if (q->buffer_used > q->buffer_limit) {
1942  		u32 dropped = 0;
1943  
1944  		while (q->buffer_used > q->buffer_limit) {
1945  			dropped++;
1946  			cake_drop(sch, to_free);
1947  		}
1948  		b->drop_overlimit += dropped;
1949  	}
1950  	return NET_XMIT_SUCCESS;
1951  }
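/* Editorial note (not part of the original source): a sketch of the autorate
 * ingress arithmetic in cake_enqueue() above, using round numbers.  Suppose
 * one averaging window accumulated avg_window_bytes = 125,000 bytes over a
 * window_interval of 10 ms (1e7 ns):
 *
 *	b = 125,000 * NSEC_PER_SEC / 1e7 = 12,500,000 bytes/s  (~100 Mbit/s)
 *
 * That sample feeds the EWMA of avg_peak_bandwidth (shift 2 when the sample
 * exceeds the current estimate, shift 8 otherwise, so increases are tracked
 * faster than decreases), and, subject to the 250 ms holdoff on
 * last_reconfig_time, the shaper rate is set to 15/16 of the smoothed peak,
 * i.e. roughly 11.7 Mbyte/s (~94 Mbit/s) once the estimate has converged.
 */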
1952  
1953  static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1954  {
1955  	struct cake_sched_data *q = qdisc_priv(sch);
1956  	struct cake_tin_data *b = &q->tins[q->cur_tin];
1957  	struct cake_flow *flow = &b->flows[q->cur_flow];
1958  	struct sk_buff *skb = NULL;
1959  	u32 len;
1960  
1961  	if (flow->head) {
1962  		skb = dequeue_head(flow);
1963  		len = qdisc_pkt_len(skb);
1964  		b->backlogs[q->cur_flow] -= len;
1965  		b->tin_backlog		 -= len;
1966  		sch->qstats.backlog      -= len;
1967  		q->buffer_used		 -= skb->truesize;
1968  		sch->q.qlen--;
1969  
1970  		if (q->overflow_timeout)
1971  			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1972  	}
1973  	return skb;
1974  }
1975  
1976  /* Discard leftover packets from a tin no longer in use. */
1977  static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1978  {
1979  	struct cake_sched_data *q = qdisc_priv(sch);
1980  	struct sk_buff *skb;
1981  
1982  	q->cur_tin = tin;
1983  	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1984  		while (!!(skb = cake_dequeue_one(sch)))
1985  			kfree_skb(skb);
1986  }
1987  
1988  static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1989  {
1990  	struct cake_sched_data *q = qdisc_priv(sch);
1991  	struct cake_tin_data *b = &q->tins[q->cur_tin];
1992  	ktime_t now = ktime_get();
1993  	struct cake_flow *flow;
1994  	struct list_head *head;
1995  	bool first_flow = true;
1996  	struct sk_buff *skb;
1997  	u64 delay;
1998  	u32 len;
1999  
2000  begin:
2001  	if (!sch->q.qlen)
2002  		return NULL;
2003  
2004  	/* global hard shaper */
2005  	if (ktime_after(q->time_next_packet, now) &&
2006  	    ktime_after(q->failsafe_next_packet, now)) {
2007  		u64 next = min(ktime_to_ns(q->time_next_packet),
2008  			       ktime_to_ns(q->failsafe_next_packet));
2009  
2010  		sch->qstats.overlimits++;
2011  		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2012  		return NULL;
2013  	}
2014  
2015  	/* Choose a class to work on. */
2016  	if (!q->rate_ns) {
2017  		/* In unlimited mode, can't rely on shaper timings, just balance
2018  		 * with DRR
2019  		 */
2020  		bool wrapped = false, empty = true;
2021  
2022  		while (b->tin_deficit < 0 ||
2023  		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2024  			if (b->tin_deficit <= 0)
2025  				b->tin_deficit += b->tin_quantum;
2026  			if (b->sparse_flow_count + b->bulk_flow_count)
2027  				empty = false;
2028  
2029  			q->cur_tin++;
2030  			b++;
2031  			if (q->cur_tin >= q->tin_cnt) {
2032  				q->cur_tin = 0;
2033  				b = q->tins;
2034  
2035  				if (wrapped) {
2036  					/* It's possible for q->qlen to be
2037  					 * nonzero when we actually have no
2038  					 * packets anywhere.
2039  					 */
2040  					if (empty)
2041  						return NULL;
2042  				} else {
2043  					wrapped = true;
2044  				}
2045  			}
2046  		}
2047  	} else {
2048  		/* In shaped mode, choose:
2049  		 * - Highest-priority tin with queue and meeting schedule, or
2050  		 * - The earliest-scheduled tin with queue.
2051  		 */
2052  		ktime_t best_time = KTIME_MAX;
2053  		int tin, best_tin = 0;
2054  
2055  		for (tin = 0; tin < q->tin_cnt; tin++) {
2056  			b = q->tins + tin;
2057  			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2058  				ktime_t time_to_pkt = \
2059  					ktime_sub(b->time_next_packet, now);
2060  
2061  				if (ktime_to_ns(time_to_pkt) <= 0 ||
2062  				    ktime_compare(time_to_pkt,
2063  						  best_time) <= 0) {
2064  					best_time = time_to_pkt;
2065  					best_tin = tin;
2066  				}
2067  			}
2068  		}
2069  
2070  		q->cur_tin = best_tin;
2071  		b = q->tins + best_tin;
2072  
2073  		/* No point in going further if no packets to deliver. */
2074  		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2075  			return NULL;
2076  	}
2077  
2078  retry:
2079  	/* service this class */
2080  	head = &b->decaying_flows;
2081  	if (!first_flow || list_empty(head)) {
2082  		head = &b->new_flows;
2083  		if (list_empty(head)) {
2084  			head = &b->old_flows;
2085  			if (unlikely(list_empty(head))) {
2086  				head = &b->decaying_flows;
2087  				if (unlikely(list_empty(head)))
2088  					goto begin;
2089  			}
2090  		}
2091  	}
2092  	flow = list_first_entry(head, struct cake_flow, flowchain);
2093  	q->cur_flow = flow - b->flows;
2094  	first_flow = false;
2095  
2096  	/* flow isolation (DRR++) */
2097  	if (flow->deficit <= 0) {
2098  		/* Keep all flows with deficits out of the sparse and decaying
2099  		 * rotations.  No non-empty flow can go into the decaying
2100  		 * rotation, so they can't get deficits
2101  		 */
2102  		if (flow->set == CAKE_SET_SPARSE) {
2103  			if (flow->head) {
2104  				b->sparse_flow_count--;
2105  				b->bulk_flow_count++;
2106  
2107  				cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2108  				cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2109  
2110  				flow->set = CAKE_SET_BULK;
2111  			} else {
2112  				/* we've moved it to the bulk rotation for
2113  				 * correct deficit accounting but we still want
2114  				 * to count it as a sparse flow, not a bulk one.
2115  				 */
2116  				flow->set = CAKE_SET_SPARSE_WAIT;
2117  			}
2118  		}
2119  
2120  		flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2121  		list_move_tail(&flow->flowchain, &b->old_flows);
2122  
2123  		goto retry;
2124  	}
2125  
2126  	/* Retrieve a packet via the AQM */
2127  	while (1) {
2128  		skb = cake_dequeue_one(sch);
2129  		if (!skb) {
2130  			/* this queue was actually empty */
2131  			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2132  				b->unresponsive_flow_count--;
2133  
2134  			if (flow->cvars.p_drop || flow->cvars.count ||
2135  			    ktime_before(now, flow->cvars.drop_next)) {
2136  				/* keep in the flowchain until the state has
2137  				 * decayed to rest
2138  				 */
2139  				list_move_tail(&flow->flowchain,
2140  					       &b->decaying_flows);
2141  				if (flow->set == CAKE_SET_BULK) {
2142  					b->bulk_flow_count--;
2143  
2144  					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2145  					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2146  
2147  					b->decaying_flow_count++;
2148  				} else if (flow->set == CAKE_SET_SPARSE ||
2149  					   flow->set == CAKE_SET_SPARSE_WAIT) {
2150  					b->sparse_flow_count--;
2151  					b->decaying_flow_count++;
2152  				}
2153  				flow->set = CAKE_SET_DECAYING;
2154  			} else {
2155  				/* remove empty queue from the flowchain */
2156  				list_del_init(&flow->flowchain);
2157  				if (flow->set == CAKE_SET_SPARSE ||
2158  				    flow->set == CAKE_SET_SPARSE_WAIT)
2159  					b->sparse_flow_count--;
2160  				else if (flow->set == CAKE_SET_BULK) {
2161  					b->bulk_flow_count--;
2162  
2163  					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2164  					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2165  				} else
2166  					b->decaying_flow_count--;
2167  
2168  				flow->set = CAKE_SET_NONE;
2169  			}
2170  			goto begin;
2171  		}
2172  
2173  		/* Last packet in queue may be marked, shouldn't be dropped */
2174  		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2175  					(b->bulk_flow_count *
2176  					 !!(q->rate_flags &
2177  					    CAKE_FLAG_INGRESS))) ||
2178  		    !flow->head)
2179  			break;
2180  
2181  		/* drop this packet, get another one */
2182  		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2183  			len = cake_advance_shaper(q, b, skb,
2184  						  now, true);
2185  			flow->deficit -= len;
2186  			b->tin_deficit -= len;
2187  		}
2188  		flow->dropped++;
2189  		b->tin_dropped++;
2190  		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2191  		qdisc_qstats_drop(sch);
2192  		kfree_skb(skb);
2193  		if (q->rate_flags & CAKE_FLAG_INGRESS)
2194  			goto retry;
2195  	}
2196  
2197  	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2198  	qdisc_bstats_update(sch, skb);
2199  
2200  	/* collect delay stats */
2201  	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2202  	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2203  	b->peak_delay = cake_ewma(b->peak_delay, delay,
2204  				  delay > b->peak_delay ? 2 : 8);
2205  	b->base_delay = cake_ewma(b->base_delay, delay,
2206  				  delay < b->base_delay ? 2 : 8);
2207  
2208  	len = cake_advance_shaper(q, b, skb, now, false);
2209  	flow->deficit -= len;
2210  	b->tin_deficit -= len;
2211  
2212  	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2213  		u64 next = min(ktime_to_ns(q->time_next_packet),
2214  			       ktime_to_ns(q->failsafe_next_packet));
2215  
2216  		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2217  	} else if (!sch->q.qlen) {
2218  		int i;
2219  
2220  		for (i = 0; i < q->tin_cnt; i++) {
2221  			if (q->tins[i].decaying_flow_count) {
2222  				ktime_t next = \
2223  					ktime_add_ns(now,
2224  						     q->tins[i].cparams.target);
2225  
2226  				qdisc_watchdog_schedule_ns(&q->watchdog,
2227  							   ktime_to_ns(next));
2228  				break;
2229  			}
2230  		}
2231  	}
2232  
2233  	if (q->overflow_timeout)
2234  		q->overflow_timeout--;
2235  
2236  	return skb;
2237  }
2238  
2239  static void cake_reset(struct Qdisc *sch)
2240  {
2241  	struct cake_sched_data *q = qdisc_priv(sch);
2242  	u32 c;
2243  
2244  	if (!q->tins)
2245  		return;
2246  
2247  	for (c = 0; c < CAKE_MAX_TINS; c++)
2248  		cake_clear_tin(sch, c);
2249  }
2250  
2251  static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2252  	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2253  	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2254  	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2255  	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2256  	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2257  	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2258  	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2259  	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2260  	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2261  	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2262  	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2263  	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2264  	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2265  	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2266  	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2267  	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2268  	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2269  };
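/* Editorial note (not part of the original source): these netlink attributes
 * correspond to keywords accepted by the iproute2 "tc" front end.  As an
 * illustration (keyword spellings are those documented in tc-cake(8), not
 * defined in this file), an invocation such as
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 100Mbit diffserv4 \
 *		nat wash ack-filter
 *
 * would arrive here as TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE,
 * TCA_CAKE_NAT, TCA_CAKE_WASH and TCA_CAKE_ACK_FILTER, parsed by
 * cake_change() below.
 */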
2270  
2271  static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2272  			  u64 target_ns, u64 rtt_est_ns)
2273  {
2274  	/* convert byte-rate into time-per-byte
2275  	 * so it will always unwedge in reasonable time.
2276  	 */
2277  	static const u64 MIN_RATE = 64;
2278  	u32 byte_target = mtu;
2279  	u64 byte_target_ns;
2280  	u8  rate_shft = 0;
2281  	u64 rate_ns = 0;
2282  
2283  	b->flow_quantum = 1514;
2284  	if (rate) {
2285  		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2286  		rate_shft = 34;
2287  		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2288  		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2289  		while (!!(rate_ns >> 34)) {
2290  			rate_ns >>= 1;
2291  			rate_shft--;
2292  		}
2293  	} /* else unlimited, ie. zero delay */
2294  
2295  	b->tin_rate_bps  = rate;
2296  	b->tin_rate_ns   = rate_ns;
2297  	b->tin_rate_shft = rate_shft;
2298  
2299  	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2300  
2301  	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2302  	b->cparams.interval = max(rtt_est_ns +
2303  				     b->cparams.target - target_ns,
2304  				     b->cparams.target * 2);
2305  	b->cparams.mtu_time = byte_target_ns;
2306  	b->cparams.p_inc = 1 << 24; /* 1/256 */
2307  	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2308  }
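/* Editorial note (not part of the original source): a worked example of the
 * conversion above, assuming the rate is expressed in bytes per second (as
 * tc passes rates over netlink), with rate = 12,500,000 (~100 Mbit/s),
 * mtu = 1514 and the default 5 ms target / 100 ms interval:
 *
 *	time per byte    = NSEC_PER_SEC / rate               = 80 ns
 *	byte_target_ns   = 1514 * 80 ns                      ~ 121 us
 *	cparams.target   = max(3/2 * 121 us, 5 ms)           = 5 ms
 *	cparams.interval = max(100 ms + 5 ms - 5 ms, 10 ms)  = 100 ms
 *
 * rate_ns starts as (NSEC_PER_SEC << 34) / rate and is shifted down (with
 * rate_shft reduced in step) until it fits in 34 bits, so a packet's
 * serialisation delay can later be computed as (len * rate_ns) >> rate_shft
 * without overflowing 64 bits.
 */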
2309  
2310  static int cake_config_besteffort(struct Qdisc *sch)
2311  {
2312  	struct cake_sched_data *q = qdisc_priv(sch);
2313  	struct cake_tin_data *b = &q->tins[0];
2314  	u32 mtu = psched_mtu(qdisc_dev(sch));
2315  	u64 rate = q->rate_bps;
2316  
2317  	q->tin_cnt = 1;
2318  
2319  	q->tin_index = besteffort;
2320  	q->tin_order = normal_order;
2321  
2322  	cake_set_rate(b, rate, mtu,
2323  		      us_to_ns(q->target), us_to_ns(q->interval));
2324  	b->tin_quantum = 65535;
2325  
2326  	return 0;
2327  }
2328  
2329  static int cake_config_precedence(struct Qdisc *sch)
2330  {
2331  	/* convert high-level (user visible) parameters into internal format */
2332  	struct cake_sched_data *q = qdisc_priv(sch);
2333  	u32 mtu = psched_mtu(qdisc_dev(sch));
2334  	u64 rate = q->rate_bps;
2335  	u32 quantum = 256;
2336  	u32 i;
2337  
2338  	q->tin_cnt = 8;
2339  	q->tin_index = precedence;
2340  	q->tin_order = normal_order;
2341  
2342  	for (i = 0; i < q->tin_cnt; i++) {
2343  		struct cake_tin_data *b = &q->tins[i];
2344  
2345  		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2346  			      us_to_ns(q->interval));
2347  
2348  		b->tin_quantum = max_t(u16, 1U, quantum);
2349  
2350  		/* calculate next class's parameters */
2351  		rate  *= 7;
2352  		rate >>= 3;
2353  
2354  		quantum  *= 7;
2355  		quantum >>= 3;
2356  	}
2357  
2358  	return 0;
2359  }
2360  
2361  /*	List of known Diffserv codepoints:
2362   *
2363   *	Default Forwarding (DF/CS0) - Best Effort
2364   *	Max Throughput (TOS2)
2365   *	Min Delay (TOS4)
2366   *	LLT "La" (TOS5)
2367   *	Assured Forwarding 1 (AF1x) - x3
2368   *	Assured Forwarding 2 (AF2x) - x3
2369   *	Assured Forwarding 3 (AF3x) - x3
2370   *	Assured Forwarding 4 (AF4x) - x3
2371   *	Precedence Class 1 (CS1)
2372   *	Precedence Class 2 (CS2)
2373   *	Precedence Class 3 (CS3)
2374   *	Precedence Class 4 (CS4)
2375   *	Precedence Class 5 (CS5)
2376   *	Precedence Class 6 (CS6)
2377   *	Precedence Class 7 (CS7)
2378   *	Voice Admit (VA)
2379   *	Expedited Forwarding (EF)
2380   *	Lower Effort (LE)
2381   *
2382   *	Total 26 codepoints.
2383   */
2384  
2385  /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2386   *		(roughly descending order of contended priority)
2387   *		(roughly ascending order of uncontended throughput)
2388   *
2389   *	Network Control (CS6,CS7)      - routing traffic
2390   *	Telephony (EF,VA)         - aka. VoIP streams
2391   *	Signalling (CS5)               - VoIP setup
2392   *	Multimedia Conferencing (AF4x) - aka. video calls
2393   *	Realtime Interactive (CS4)     - eg. games
2394   *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2395   *	Broadcast Video (CS3)
2396   *	Low-Latency Data (AF2x,TOS4)      - eg. database
2397   *	Ops, Admin, Management (CS2)      - eg. ssh
2398   *	Standard Service (DF & unrecognised codepoints)
2399   *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2400   *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2401   *
2402   *	Total 12 traffic classes.
2403   */
2404  
2405  static int cake_config_diffserv8(struct Qdisc *sch)
2406  {
2407  /*	Pruned list of traffic classes for typical applications:
2408   *
2409   *		Network Control          (CS6, CS7)
2410   *		Minimum Latency          (EF, VA, CS5, CS4)
2411   *		Interactive Shell        (CS2)
2412   *		Low Latency Transactions (AF2x, TOS4)
2413   *		Video Streaming          (AF4x, AF3x, CS3)
2414   *		Bog Standard             (DF etc.)
2415   *		High Throughput          (AF1x, TOS2, CS1)
2416   *		Background Traffic       (LE)
2417   *
2418   *		Total 8 traffic classes.
2419   */
2420  
2421  	struct cake_sched_data *q = qdisc_priv(sch);
2422  	u32 mtu = psched_mtu(qdisc_dev(sch));
2423  	u64 rate = q->rate_bps;
2424  	u32 quantum = 256;
2425  	u32 i;
2426  
2427  	q->tin_cnt = 8;
2428  
2429  	/* codepoint to class mapping */
2430  	q->tin_index = diffserv8;
2431  	q->tin_order = normal_order;
2432  
2433  	/* class characteristics */
2434  	for (i = 0; i < q->tin_cnt; i++) {
2435  		struct cake_tin_data *b = &q->tins[i];
2436  
2437  		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2438  			      us_to_ns(q->interval));
2439  
2440  		b->tin_quantum = max_t(u16, 1U, quantum);
2441  
2442  		/* calculate next class's parameters */
2443  		rate  *= 7;
2444  		rate >>= 3;
2445  
2446  		quantum  *= 7;
2447  		quantum >>= 3;
2448  	}
2449  
2450  	return 0;
2451  }
2452  
2453  static int cake_config_diffserv4(struct Qdisc *sch)
2454  {
2455  /*  Further pruned list of traffic classes for four-class system:
2456   *
2457   *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2458   *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2459   *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2460   *	    Background Traffic (LE, CS1)
2461   *
2462   *		Total 4 traffic classes.
2463   */
2464  
2465  	struct cake_sched_data *q = qdisc_priv(sch);
2466  	u32 mtu = psched_mtu(qdisc_dev(sch));
2467  	u64 rate = q->rate_bps;
2468  	u32 quantum = 1024;
2469  
2470  	q->tin_cnt = 4;
2471  
2472  	/* codepoint to class mapping */
2473  	q->tin_index = diffserv4;
2474  	q->tin_order = bulk_order;
2475  
2476  	/* class characteristics */
2477  	cake_set_rate(&q->tins[0], rate, mtu,
2478  		      us_to_ns(q->target), us_to_ns(q->interval));
2479  	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2480  		      us_to_ns(q->target), us_to_ns(q->interval));
2481  	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2482  		      us_to_ns(q->target), us_to_ns(q->interval));
2483  	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2484  		      us_to_ns(q->target), us_to_ns(q->interval));
2485  
2486  	/* bandwidth-sharing weights */
2487  	q->tins[0].tin_quantum = quantum;
2488  	q->tins[1].tin_quantum = quantum >> 4;
2489  	q->tins[2].tin_quantum = quantum >> 1;
2490  	q->tins[3].tin_quantum = quantum >> 2;
2491  
2492  	return 0;
2493  }
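/* Editorial note (not part of the original source): with a 100 Mbit/s base
 * rate the four tins above get bandwidth thresholds of rate, rate/16,
 * rate/2 and rate/4 - i.e. 100, 6.25, 50 and 25 Mbit/s for tins[0..3] -
 * and DRR quanta of 1024, 64, 512 and 256 from the quantum shifts above.
 */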
2494  
2495  static int cake_config_diffserv3(struct Qdisc *sch)
2496  {
2497  /*  Simplified Diffserv structure with 3 tins.
2498   *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2499   *		Best Effort
2500   *		Low Priority		(LE, CS1)
2501   */
2502  	struct cake_sched_data *q = qdisc_priv(sch);
2503  	u32 mtu = psched_mtu(qdisc_dev(sch));
2504  	u64 rate = q->rate_bps;
2505  	u32 quantum = 1024;
2506  
2507  	q->tin_cnt = 3;
2508  
2509  	/* codepoint to class mapping */
2510  	q->tin_index = diffserv3;
2511  	q->tin_order = bulk_order;
2512  
2513  	/* class characteristics */
2514  	cake_set_rate(&q->tins[0], rate, mtu,
2515  		      us_to_ns(q->target), us_to_ns(q->interval));
2516  	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2517  		      us_to_ns(q->target), us_to_ns(q->interval));
2518  	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2519  		      us_to_ns(q->target), us_to_ns(q->interval));
2520  
2521  	/* bandwidth-sharing weights */
2522  	q->tins[0].tin_quantum = quantum;
2523  	q->tins[1].tin_quantum = quantum >> 4;
2524  	q->tins[2].tin_quantum = quantum >> 2;
2525  
2526  	return 0;
2527  }
2528  
2529  static void cake_reconfigure(struct Qdisc *sch)
2530  {
2531  	struct cake_sched_data *q = qdisc_priv(sch);
2532  	int c, ft;
2533  
2534  	switch (q->tin_mode) {
2535  	case CAKE_DIFFSERV_BESTEFFORT:
2536  		ft = cake_config_besteffort(sch);
2537  		break;
2538  
2539  	case CAKE_DIFFSERV_PRECEDENCE:
2540  		ft = cake_config_precedence(sch);
2541  		break;
2542  
2543  	case CAKE_DIFFSERV_DIFFSERV8:
2544  		ft = cake_config_diffserv8(sch);
2545  		break;
2546  
2547  	case CAKE_DIFFSERV_DIFFSERV4:
2548  		ft = cake_config_diffserv4(sch);
2549  		break;
2550  
2551  	case CAKE_DIFFSERV_DIFFSERV3:
2552  	default:
2553  		ft = cake_config_diffserv3(sch);
2554  		break;
2555  	}
2556  
2557  	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2558  		cake_clear_tin(sch, c);
2559  		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2560  	}
2561  
2562  	q->rate_ns   = q->tins[ft].tin_rate_ns;
2563  	q->rate_shft = q->tins[ft].tin_rate_shft;
2564  
2565  	if (q->buffer_config_limit) {
2566  		q->buffer_limit = q->buffer_config_limit;
2567  	} else if (q->rate_bps) {
2568  		u64 t = q->rate_bps * q->interval;
2569  
2570  		do_div(t, USEC_PER_SEC / 4);
2571  		q->buffer_limit = max_t(u32, t, 4U << 20);
2572  	} else {
2573  		q->buffer_limit = ~0;
2574  	}
2575  
2576  	sch->flags &= ~TCQ_F_CAN_BYPASS;
2577  
2578  	q->buffer_limit = min(q->buffer_limit,
2579  			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2580  				  q->buffer_config_limit));
2581  }
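/* Editorial note (not part of the original source): a worked example of the
 * default buffer sizing in cake_reconfigure() above, assuming no memlimit is
 * configured, rate_bps = 12,500,000 (taken as bytes/s, ~100 Mbit/s) and the
 * default interval of 100000 us:
 *
 *	t = 12,500,000 * 100,000            = 1.25e12
 *	t / (USEC_PER_SEC / 4) = t / 250,000 = 5,000,000
 *
 * i.e. roughly 5 MB, four intervals' worth of bytes at the shaped rate,
 * subject to the 4 MiB floor (4U << 20) and the
 * sch->limit * psched_mtu() ceiling applied at the end of the function.
 */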
2582  
2583  static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2584  		       struct netlink_ext_ack *extack)
2585  {
2586  	struct cake_sched_data *q = qdisc_priv(sch);
2587  	struct nlattr *tb[TCA_CAKE_MAX + 1];
2588  	int err;
2589  
2590  	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2591  					  extack);
2592  	if (err < 0)
2593  		return err;
2594  
2595  	if (tb[TCA_CAKE_NAT]) {
2596  #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2597  		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2598  		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2599  			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2600  #else
2601  		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2602  				    "No conntrack support in kernel");
2603  		return -EOPNOTSUPP;
2604  #endif
2605  	}
2606  
2607  	if (tb[TCA_CAKE_BASE_RATE64])
2608  		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2609  
2610  	if (tb[TCA_CAKE_DIFFSERV_MODE])
2611  		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2612  
2613  	if (tb[TCA_CAKE_WASH]) {
2614  		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2615  			q->rate_flags |= CAKE_FLAG_WASH;
2616  		else
2617  			q->rate_flags &= ~CAKE_FLAG_WASH;
2618  	}
2619  
2620  	if (tb[TCA_CAKE_FLOW_MODE])
2621  		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2622  				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2623  					CAKE_FLOW_MASK));
2624  
2625  	if (tb[TCA_CAKE_ATM])
2626  		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2627  
2628  	if (tb[TCA_CAKE_OVERHEAD]) {
2629  		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2630  		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2631  
2632  		q->max_netlen = 0;
2633  		q->max_adjlen = 0;
2634  		q->min_netlen = ~0;
2635  		q->min_adjlen = ~0;
2636  	}
2637  
2638  	if (tb[TCA_CAKE_RAW]) {
2639  		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2640  
2641  		q->max_netlen = 0;
2642  		q->max_adjlen = 0;
2643  		q->min_netlen = ~0;
2644  		q->min_adjlen = ~0;
2645  	}
2646  
2647  	if (tb[TCA_CAKE_MPU])
2648  		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2649  
2650  	if (tb[TCA_CAKE_RTT]) {
2651  		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2652  
2653  		if (!q->interval)
2654  			q->interval = 1;
2655  	}
2656  
2657  	if (tb[TCA_CAKE_TARGET]) {
2658  		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2659  
2660  		if (!q->target)
2661  			q->target = 1;
2662  	}
2663  
2664  	if (tb[TCA_CAKE_AUTORATE]) {
2665  		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2666  			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2667  		else
2668  			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2669  	}
2670  
2671  	if (tb[TCA_CAKE_INGRESS]) {
2672  		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2673  			q->rate_flags |= CAKE_FLAG_INGRESS;
2674  		else
2675  			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2676  	}
2677  
2678  	if (tb[TCA_CAKE_ACK_FILTER])
2679  		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2680  
2681  	if (tb[TCA_CAKE_MEMORY])
2682  		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2683  
2684  	if (tb[TCA_CAKE_SPLIT_GSO]) {
2685  		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2686  			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2687  		else
2688  			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2689  	}
2690  
2691  	if (tb[TCA_CAKE_FWMARK]) {
2692  		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2693  		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2694  	}
2695  
2696  	if (q->tins) {
2697  		sch_tree_lock(sch);
2698  		cake_reconfigure(sch);
2699  		sch_tree_unlock(sch);
2700  	}
2701  
2702  	return 0;
2703  }
2704  
2705  static void cake_destroy(struct Qdisc *sch)
2706  {
2707  	struct cake_sched_data *q = qdisc_priv(sch);
2708  
2709  	qdisc_watchdog_cancel(&q->watchdog);
2710  	tcf_block_put(q->block);
2711  	kvfree(q->tins);
2712  }
2713  
2714  static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2715  		     struct netlink_ext_ack *extack)
2716  {
2717  	struct cake_sched_data *q = qdisc_priv(sch);
2718  	int i, j, err;
2719  
2720  	sch->limit = 10240;
2721  	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2722  	q->flow_mode  = CAKE_FLOW_TRIPLE;
2723  
2724  	q->rate_bps = 0; /* unlimited by default */
2725  
2726  	q->interval = 100000; /* 100ms default */
2727  	q->target   =   5000; /* 5ms: codel RFC argues
2728  			       * for 5 to 10% of interval
2729  			       */
2730  	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2731  	q->cur_tin = 0;
2732  	q->cur_flow  = 0;
2733  
2734  	qdisc_watchdog_init(&q->watchdog, sch);
2735  
2736  	if (opt) {
2737  		err = cake_change(sch, opt, extack);
2738  
2739  		if (err)
2740  			return err;
2741  	}
2742  
2743  	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2744  	if (err)
2745  		return err;
2746  
2747  	quantum_div[0] = ~0;
2748  	for (i = 1; i <= CAKE_QUEUES; i++)
2749  		quantum_div[i] = 65535 / i;
2750  
2751  	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2752  			   GFP_KERNEL);
2753  	if (!q->tins)
2754  		return -ENOMEM;
2755  
2756  	for (i = 0; i < CAKE_MAX_TINS; i++) {
2757  		struct cake_tin_data *b = q->tins + i;
2758  
2759  		INIT_LIST_HEAD(&b->new_flows);
2760  		INIT_LIST_HEAD(&b->old_flows);
2761  		INIT_LIST_HEAD(&b->decaying_flows);
2762  		b->sparse_flow_count = 0;
2763  		b->bulk_flow_count = 0;
2764  		b->decaying_flow_count = 0;
2765  
2766  		for (j = 0; j < CAKE_QUEUES; j++) {
2767  			struct cake_flow *flow = b->flows + j;
2768  			u32 k = j * CAKE_MAX_TINS + i;
2769  
2770  			INIT_LIST_HEAD(&flow->flowchain);
2771  			cobalt_vars_init(&flow->cvars);
2772  
2773  			q->overflow_heap[k].t = i;
2774  			q->overflow_heap[k].b = j;
2775  			b->overflow_idx[j] = k;
2776  		}
2777  	}
2778  
2779  	cake_reconfigure(sch);
2780  	q->avg_peak_bandwidth = q->rate_bps;
2781  	q->min_netlen = ~0;
2782  	q->min_adjlen = ~0;
2783  	return 0;
2784  }
2785  
2786  static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2787  {
2788  	struct cake_sched_data *q = qdisc_priv(sch);
2789  	struct nlattr *opts;
2790  
2791  	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2792  	if (!opts)
2793  		goto nla_put_failure;
2794  
2795  	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2796  			      TCA_CAKE_PAD))
2797  		goto nla_put_failure;
2798  
2799  	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2800  			q->flow_mode & CAKE_FLOW_MASK))
2801  		goto nla_put_failure;
2802  
2803  	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2804  		goto nla_put_failure;
2805  
2806  	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2807  		goto nla_put_failure;
2808  
2809  	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2810  		goto nla_put_failure;
2811  
2812  	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2813  			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2814  		goto nla_put_failure;
2815  
2816  	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2817  			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2818  		goto nla_put_failure;
2819  
2820  	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2821  		goto nla_put_failure;
2822  
2823  	if (nla_put_u32(skb, TCA_CAKE_NAT,
2824  			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2825  		goto nla_put_failure;
2826  
2827  	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2828  		goto nla_put_failure;
2829  
2830  	if (nla_put_u32(skb, TCA_CAKE_WASH,
2831  			!!(q->rate_flags & CAKE_FLAG_WASH)))
2832  		goto nla_put_failure;
2833  
2834  	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2835  		goto nla_put_failure;
2836  
2837  	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2838  		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2839  			goto nla_put_failure;
2840  
2841  	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2842  		goto nla_put_failure;
2843  
2844  	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2845  		goto nla_put_failure;
2846  
2847  	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2848  			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2849  		goto nla_put_failure;
2850  
2851  	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2852  		goto nla_put_failure;
2853  
2854  	return nla_nest_end(skb, opts);
2855  
2856  nla_put_failure:
2857  	return -1;
2858  }
2859  
2860  static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2861  {
2862  	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2863  	struct cake_sched_data *q = qdisc_priv(sch);
2864  	struct nlattr *tstats, *ts;
2865  	int i;
2866  
2867  	if (!stats)
2868  		return -1;
2869  
2870  #define PUT_STAT_U32(attr, data) do {				       \
2871  		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2872  			goto nla_put_failure;			       \
2873  	} while (0)
2874  #define PUT_STAT_U64(attr, data) do {				       \
2875  		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2876  					data, TCA_CAKE_STATS_PAD)) \
2877  			goto nla_put_failure;			       \
2878  	} while (0)
2879  
2880  	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2881  	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2882  	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2883  	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2884  	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2885  	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2886  	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2887  	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2888  
2889  #undef PUT_STAT_U32
2890  #undef PUT_STAT_U64
2891  
2892  	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2893  	if (!tstats)
2894  		goto nla_put_failure;
2895  
2896  #define PUT_TSTAT_U32(attr, data) do {					\
2897  		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2898  			goto nla_put_failure;				\
2899  	} while (0)
2900  #define PUT_TSTAT_U64(attr, data) do {					\
2901  		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2902  					data, TCA_CAKE_TIN_STATS_PAD))	\
2903  			goto nla_put_failure;				\
2904  	} while (0)
2905  
2906  	for (i = 0; i < q->tin_cnt; i++) {
2907  		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2908  
2909  		ts = nla_nest_start_noflag(d->skb, i + 1);
2910  		if (!ts)
2911  			goto nla_put_failure;
2912  
2913  		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2914  		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2915  		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2916  
2917  		PUT_TSTAT_U32(TARGET_US,
2918  			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2919  		PUT_TSTAT_U32(INTERVAL_US,
2920  			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2921  
2922  		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2923  		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2924  		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2925  		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2926  
2927  		PUT_TSTAT_U32(PEAK_DELAY_US,
2928  			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2929  		PUT_TSTAT_U32(AVG_DELAY_US,
2930  			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2931  		PUT_TSTAT_U32(BASE_DELAY_US,
2932  			      ktime_to_us(ns_to_ktime(b->base_delay)));
2933  
2934  		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2935  		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2936  		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2937  
2938  		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2939  					    b->decaying_flow_count);
2940  		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2941  		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2942  		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2943  
2944  		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2945  		nla_nest_end(d->skb, ts);
2946  	}
2947  
2948  #undef PUT_TSTAT_U32
2949  #undef PUT_TSTAT_U64
2950  
2951  	nla_nest_end(d->skb, tstats);
2952  	return nla_nest_end(d->skb, stats);
2953  
2954  nla_put_failure:
2955  	nla_nest_cancel(d->skb, stats);
2956  	return -1;
2957  }
2958  
2959  static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2960  {
2961  	return NULL;
2962  }
2963  
2964  static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2965  {
2966  	return 0;
2967  }
2968  
2969  static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2970  			       u32 classid)
2971  {
2972  	return 0;
2973  }
2974  
2975  static void cake_unbind(struct Qdisc *q, unsigned long cl)
2976  {
2977  }
2978  
2979  static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2980  					struct netlink_ext_ack *extack)
2981  {
2982  	struct cake_sched_data *q = qdisc_priv(sch);
2983  
2984  	if (cl)
2985  		return NULL;
2986  	return q->block;
2987  }
2988  
2989  static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2990  			   struct sk_buff *skb, struct tcmsg *tcm)
2991  {
2992  	tcm->tcm_handle |= TC_H_MIN(cl);
2993  	return 0;
2994  }
2995  
2996  static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2997  				 struct gnet_dump *d)
2998  {
2999  	struct cake_sched_data *q = qdisc_priv(sch);
3000  	const struct cake_flow *flow = NULL;
3001  	struct gnet_stats_queue qs = { 0 };
3002  	struct nlattr *stats;
3003  	u32 idx = cl - 1;
3004  
3005  	if (idx < CAKE_QUEUES * q->tin_cnt) {
3006  		const struct cake_tin_data *b = \
3007  			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3008  		const struct sk_buff *skb;
3009  
3010  		flow = &b->flows[idx % CAKE_QUEUES];
3011  
3012  		if (flow->head) {
3013  			sch_tree_lock(sch);
3014  			skb = flow->head;
3015  			while (skb) {
3016  				qs.qlen++;
3017  				skb = skb->next;
3018  			}
3019  			sch_tree_unlock(sch);
3020  		}
3021  		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3022  		qs.drops = flow->dropped;
3023  	}
3024  	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3025  		return -1;
3026  	if (flow) {
3027  		ktime_t now = ktime_get();
3028  
3029  		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3030  		if (!stats)
3031  			return -1;
3032  
3033  #define PUT_STAT_U32(attr, data) do {				       \
3034  		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3035  			goto nla_put_failure;			       \
3036  	} while (0)
3037  #define PUT_STAT_S32(attr, data) do {				       \
3038  		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3039  			goto nla_put_failure;			       \
3040  	} while (0)
3041  
3042  		PUT_STAT_S32(DEFICIT, flow->deficit);
3043  		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3044  		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3045  		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3046  		if (flow->cvars.p_drop) {
3047  			PUT_STAT_S32(BLUE_TIMER_US,
3048  				     ktime_to_us(
3049  					     ktime_sub(now,
3050  						       flow->cvars.blue_timer)));
3051  		}
3052  		if (flow->cvars.dropping) {
3053  			PUT_STAT_S32(DROP_NEXT_US,
3054  				     ktime_to_us(
3055  					     ktime_sub(now,
3056  						       flow->cvars.drop_next)));
3057  		}
3058  
3059  		if (nla_nest_end(d->skb, stats) < 0)
3060  			return -1;
3061  	}
3062  
3063  	return 0;
3064  
3065  nla_put_failure:
3066  	nla_nest_cancel(d->skb, stats);
3067  	return -1;
3068  }
3069  
3070  static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3071  {
3072  	struct cake_sched_data *q = qdisc_priv(sch);
3073  	unsigned int i, j;
3074  
3075  	if (arg->stop)
3076  		return;
3077  
3078  	for (i = 0; i < q->tin_cnt; i++) {
3079  		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3080  
3081  		for (j = 0; j < CAKE_QUEUES; j++) {
3082  			if (list_empty(&b->flows[j].flowchain)) {
3083  				arg->count++;
3084  				continue;
3085  			}
3086  			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3087  						 arg))
3088  				break;
3089  		}
3090  	}
3091  }
3092  
3093  static const struct Qdisc_class_ops cake_class_ops = {
3094  	.leaf		=	cake_leaf,
3095  	.find		=	cake_find,
3096  	.tcf_block	=	cake_tcf_block,
3097  	.bind_tcf	=	cake_bind,
3098  	.unbind_tcf	=	cake_unbind,
3099  	.dump		=	cake_dump_class,
3100  	.dump_stats	=	cake_dump_class_stats,
3101  	.walk		=	cake_walk,
3102  };
3103  
3104  static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3105  	.cl_ops		=	&cake_class_ops,
3106  	.id		=	"cake",
3107  	.priv_size	=	sizeof(struct cake_sched_data),
3108  	.enqueue	=	cake_enqueue,
3109  	.dequeue	=	cake_dequeue,
3110  	.peek		=	qdisc_peek_dequeued,
3111  	.init		=	cake_init,
3112  	.reset		=	cake_reset,
3113  	.destroy	=	cake_destroy,
3114  	.change		=	cake_change,
3115  	.dump		=	cake_dump,
3116  	.dump_stats	=	cake_dump_stats,
3117  	.owner		=	THIS_MODULE,
3118  };
3119  
3120  static int __init cake_module_init(void)
3121  {
3122  	return register_qdisc(&cake_qdisc_ops);
3123  }
3124  
3125  static void __exit cake_module_exit(void)
3126  {
3127  	unregister_qdisc(&cake_qdisc_ops);
3128  }
3129  
3130  module_init(cake_module_init)
3131  module_exit(cake_module_exit)
3132  MODULE_AUTHOR("Jonathan Morton");
3133  MODULE_LICENSE("Dual BSD/GPL");
3134  MODULE_DESCRIPTION("The CAKE shaper.");
3135