// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no per-flow state. The difference from RED is an additional
   step during enqueue: if the average queue size is over the low threshold
   (qmin), a packet is chosen at random from the queue. If both the new and
   the chosen packet are from the same flow, both are dropped. Unlike RED,
   CHOKe is not really a "classful" qdisc because it needs to access packets
   in the queue at random. It has a minimal class interface to allow
   overriding the builtin flow classifier with filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */
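
/*
 * Illustration only, not part of the original file: with the iproute2
 * userspace tool, a CHOKe instance might be configured along these lines
 * (parameter names follow tc-choke(8); the values below are arbitrary
 * examples, not recommendations):
 *
 *	tc qdisc add dev eth0 root choke limit 1000 min 100 max 300 \
 *		avpkt 1000 burst 200 bandwidth 10mbit ecn
 *
 * "limit" maps to q->limit, "min"/"max" to the RED thresholds
 * qth_min/qth_max, and "ecn" to the TC_RED_ECN flag tested by use_ecn()
 * below.
 */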

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
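
/*
 * Illustration (values are arbitrary): with a table of 8 slots, tab_mask
 * is 7.  If head = 6 and tail = 2 the occupied region wraps around the end
 * of the array, and choke_len() = (2 - 6) & 7 = 4, i.e. four slots are
 * accounted for, including any holes left behind by earlier drops.
 */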

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
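
/*
 * Illustration of hole skipping (indices are arbitrary): with head = 0,
 * tail = 4 and tab[0..3] = {A, NULL, NULL, B}, removing A clears slot 0
 * and choke_zap_head_holes() advances head past the two holes straight to
 * slot 3, so only B remains between head and tail.
 */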

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
			      struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch, to_free);
	--sch->q.qlen;
}

struct choke_skb_cb {
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

/*
 * Compare the flows of two packets.
 * Returns true only if source and destination addresses and ports match;
 * false for special cases (e.g. when the protocols differ).  Flow keys are
 * dissected at most once per packet and cached in the skb control block.
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}
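
/*
 * Illustration of the matching contract above: two TCP segments of the
 * same connection dissect to identical flow key digests and therefore
 * match, while an IPv4 and an IPv6 packet can never match because their
 * skb->protocol values already differ.
 */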

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can contain holes from previous deletions, retry
 *   several times to find a random skb, then just give up and return the
 *   head of the queue.
 * Will return NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retrys = 3;

	do {
		*pidx = (q->head + get_random_u32_below(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retrys > 0);

	return q->tab[*pidx = q->head];
}
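
/*
 * Rough illustration of the retry bound above (numbers are approximate):
 * if half of the slots between head and tail are holes, each random probe
 * misses with probability about 1/2, so the head-of-queue fallback is
 * taken only about (1/2)^3 = 12.5% of the time.
 */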

/*
 * Compare the new packet with a packet drawn at random from the queue.
 * Returns true if the flows match; *pidx is set to the index of the
 * sampled packet.
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	return choke_match_flow(oskb, nskb);
}
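
/*
 * Summary of the enqueue decision below: while qavg is at or below qth_min
 * the packet is simply queued.  Above qth_min a random queued packet is
 * compared with the new one first; on a flow match both are dropped (the
 * CHOKe step).  Otherwise the usual RED logic applies: above qth_max the
 * packet is force-marked or force-dropped, in between it is marked or
 * dropped probabilistically.  Finally, a packet that survives all of this
 * is still tail-dropped (pdrop) if the queue has reached its limit.
 */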

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx, to_free);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch, to_free);

congestion_drop:
	qdisc_drop(skb, sch, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		rtnl_qdisc_drop(skb, sch);
	}

	if (q->tab)
		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};


static void choke_free(void *addr)
{
	kvfree(addr);
}
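
/*
 * choke_change() parses the netlink configuration, validates the RED
 * parameters, and, when the new limit implies a different table size,
 * swaps in a freshly allocated table: queued packets are migrated in order
 * under the qdisc tree lock, anything that does not fit is dropped and
 * accounted for, and the old table is freed only after the lock is
 * released.
 */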

static int choke_change(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;
	u8 *stab;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
					  choke_policy, NULL);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
	stab = nla_data(tb[TCA_CHOKE_STAB]);
	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log, stab))
		return -EINVAL;

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      stab,
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	return choke_change(sch, opt, extack);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	choke_free(q->tab);
}

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");