xref: /openbmc/linux/net/sched/sch_choke.c (revision 75b1a8f9d62e50f05d0e4e9f3c8bcde32527ffc1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_choke.c	CHOKE scheduler
4  *
5  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
6  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
7  */
8 
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/skbuff.h>
13 #include <linux/vmalloc.h>
14 #include <net/pkt_sched.h>
15 #include <net/pkt_cls.h>
16 #include <net/inet_ecn.h>
17 #include <net/red.h>
18 #include <net/flow_dissector.h>
19 
20 /*
21    CHOKe stateless AQM for fair bandwidth allocation
22    =================================================
23 
24    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
25    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
26    maintains no flow state. The difference from RED is an additional step
27    during the enqueuing process. If average queue size is over the
28    low threshold (qmin), a packet is chosen at random from the queue.
29    If both the new and chosen packet are from the same flow, both
30    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
31    needs to access packets in queue randomly. It has a minimal class
32    interface to allow overriding the builtin flow classifier with
33    filters.
34 
35    Source:
36    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
37    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
38    IEEE INFOCOM, 2000.
39 
40    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
41    Characteristics", IEEE/ACM Transactions on Networking, 2004
42 
43  */
44 
45 /* Upper bound on size of sk_buff table (packets) */
46 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
47 
48 struct choke_sched_data {
49 /* Parameters */
50 	u32		 limit;
51 	unsigned char	 flags;
52 
53 	struct red_parms parms;
54 
55 /* Variables */
56 	struct red_vars  vars;
57 	struct {
58 		u32	prob_drop;	/* Early probability drops */
59 		u32	prob_mark;	/* Early probability marks */
60 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
61 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
62 		u32	pdrop;          /* Drops due to queue limits */
63 		u32	other;          /* Drops due to drop() calls */
64 		u32	matched;	/* Drops to flow match */
65 	} stats;
66 
67 	unsigned int	 head;
68 	unsigned int	 tail;
69 
70 	unsigned int	 tab_mask; /* size - 1 */
71 
72 	struct sk_buff **tab;
73 };
74 
75 /* number of elements in queue including holes */
76 static unsigned int choke_len(const struct choke_sched_data *q)
77 {
78 	return (q->tail - q->head) & q->tab_mask;
79 }
80 
81 /* Is ECN parameter configured */
82 static int use_ecn(const struct choke_sched_data *q)
83 {
84 	return q->flags & TC_RED_ECN;
85 }
86 
87 /* Should packets over max just be dropped (versus marked) */
88 static int use_harddrop(const struct choke_sched_data *q)
89 {
90 	return q->flags & TC_RED_HARDDROP;
91 }
92 
93 /* Move head pointer forward to skip over holes */
94 static void choke_zap_head_holes(struct choke_sched_data *q)
95 {
96 	do {
97 		q->head = (q->head + 1) & q->tab_mask;
98 		if (q->head == q->tail)
99 			break;
100 	} while (q->tab[q->head] == NULL);
101 }
102 
103 /* Move tail pointer backwards to reuse holes */
104 static void choke_zap_tail_holes(struct choke_sched_data *q)
105 {
106 	do {
107 		q->tail = (q->tail - 1) & q->tab_mask;
108 		if (q->head == q->tail)
109 			break;
110 	} while (q->tab[q->tail] == NULL);
111 }
112 
113 /* Drop packet from queue array by creating a "hole" */
114 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
115 			      struct sk_buff **to_free)
116 {
117 	struct choke_sched_data *q = qdisc_priv(sch);
118 	struct sk_buff *skb = q->tab[idx];
119 
120 	q->tab[idx] = NULL;
121 
122 	if (idx == q->head)
123 		choke_zap_head_holes(q);
124 	if (idx == q->tail)
125 		choke_zap_tail_holes(q);
126 
127 	qdisc_qstats_backlog_dec(sch, skb);
128 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
129 	qdisc_drop(skb, sch, to_free);
130 	--sch->q.qlen;
131 }
132 
133 struct choke_skb_cb {
134 	u8			keys_valid;
135 	struct			flow_keys_digest keys;
136 };
137 
138 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
139 {
140 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
141 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
142 }
143 
144 /*
145  * Compare flow of two packets
146  *  Returns true only if source and destination address and port match.
147  *          false for special cases
148  */
149 static bool choke_match_flow(struct sk_buff *skb1,
150 			     struct sk_buff *skb2)
151 {
152 	struct flow_keys temp;
153 
154 	if (skb1->protocol != skb2->protocol)
155 		return false;
156 
157 	if (!choke_skb_cb(skb1)->keys_valid) {
158 		choke_skb_cb(skb1)->keys_valid = 1;
159 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
160 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
161 	}
162 
163 	if (!choke_skb_cb(skb2)->keys_valid) {
164 		choke_skb_cb(skb2)->keys_valid = 1;
165 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
166 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
167 	}
168 
169 	return !memcmp(&choke_skb_cb(skb1)->keys,
170 		       &choke_skb_cb(skb2)->keys,
171 		       sizeof(choke_skb_cb(skb1)->keys));
172 }
173 
174 /*
175  * Select a packet at random from queue
176  * HACK: since queue can have holes from previous deletion; retry several
177  *   times to find a random skb but then just give up and return the head
178  * Will return NULL if queue is empty (q->head == q->tail)
179  */
180 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
181 					 unsigned int *pidx)
182 {
183 	struct sk_buff *skb;
184 	int retrys = 3;
185 
186 	do {
187 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
188 		skb = q->tab[*pidx];
189 		if (skb)
190 			return skb;
191 	} while (--retrys > 0);
192 
193 	return q->tab[*pidx = q->head];
194 }
195 
196 /*
197  * Compare new packet with random packet in queue
198  * returns true if matched and sets *pidx
199  */
200 static bool choke_match_random(const struct choke_sched_data *q,
201 			       struct sk_buff *nskb,
202 			       unsigned int *pidx)
203 {
204 	struct sk_buff *oskb;
205 
206 	if (q->head == q->tail)
207 		return false;
208 
209 	oskb = choke_peek_random(q, pidx);
210 	return choke_match_flow(oskb, nskb);
211 }
212 
213 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
214 			 struct sk_buff **to_free)
215 {
216 	struct choke_sched_data *q = qdisc_priv(sch);
217 	const struct red_parms *p = &q->parms;
218 
219 	choke_skb_cb(skb)->keys_valid = 0;
220 	/* Compute average queue usage (see RED) */
221 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
222 	if (red_is_idling(&q->vars))
223 		red_end_of_idle_period(&q->vars);
224 
225 	/* Is queue small? */
226 	if (q->vars.qavg <= p->qth_min)
227 		q->vars.qcount = -1;
228 	else {
229 		unsigned int idx;
230 
231 		/* Draw a packet at random from queue and compare flow */
232 		if (choke_match_random(q, skb, &idx)) {
233 			q->stats.matched++;
234 			choke_drop_by_idx(sch, idx, to_free);
235 			goto congestion_drop;
236 		}
237 
238 		/* Queue is large, always mark/drop */
239 		if (q->vars.qavg > p->qth_max) {
240 			q->vars.qcount = -1;
241 
242 			qdisc_qstats_overlimit(sch);
243 			if (use_harddrop(q) || !use_ecn(q) ||
244 			    !INET_ECN_set_ce(skb)) {
245 				q->stats.forced_drop++;
246 				goto congestion_drop;
247 			}
248 
249 			q->stats.forced_mark++;
250 		} else if (++q->vars.qcount) {
251 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
252 				q->vars.qcount = 0;
253 				q->vars.qR = red_random(p);
254 
255 				qdisc_qstats_overlimit(sch);
256 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
257 					q->stats.prob_drop++;
258 					goto congestion_drop;
259 				}
260 
261 				q->stats.prob_mark++;
262 			}
263 		} else
264 			q->vars.qR = red_random(p);
265 	}
266 
267 	/* Admit new packet */
268 	if (sch->q.qlen < q->limit) {
269 		q->tab[q->tail] = skb;
270 		q->tail = (q->tail + 1) & q->tab_mask;
271 		++sch->q.qlen;
272 		qdisc_qstats_backlog_inc(sch, skb);
273 		return NET_XMIT_SUCCESS;
274 	}
275 
276 	q->stats.pdrop++;
277 	return qdisc_drop(skb, sch, to_free);
278 
279 congestion_drop:
280 	qdisc_drop(skb, sch, to_free);
281 	return NET_XMIT_CN;
282 }
283 
284 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
285 {
286 	struct choke_sched_data *q = qdisc_priv(sch);
287 	struct sk_buff *skb;
288 
289 	if (q->head == q->tail) {
290 		if (!red_is_idling(&q->vars))
291 			red_start_of_idle_period(&q->vars);
292 		return NULL;
293 	}
294 
295 	skb = q->tab[q->head];
296 	q->tab[q->head] = NULL;
297 	choke_zap_head_holes(q);
298 	--sch->q.qlen;
299 	qdisc_qstats_backlog_dec(sch, skb);
300 	qdisc_bstats_update(sch, skb);
301 
302 	return skb;
303 }
304 
305 static void choke_reset(struct Qdisc *sch)
306 {
307 	struct choke_sched_data *q = qdisc_priv(sch);
308 
309 	while (q->head != q->tail) {
310 		struct sk_buff *skb = q->tab[q->head];
311 
312 		q->head = (q->head + 1) & q->tab_mask;
313 		if (!skb)
314 			continue;
315 		rtnl_qdisc_drop(skb, sch);
316 	}
317 
318 	sch->q.qlen = 0;
319 	sch->qstats.backlog = 0;
320 	if (q->tab)
321 		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
322 	q->head = q->tail = 0;
323 	red_restart(&q->vars);
324 }
325 
326 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
327 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
328 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
329 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
330 };
331 
332 
333 static void choke_free(void *addr)
334 {
335 	kvfree(addr);
336 }
337 
338 static int choke_change(struct Qdisc *sch, struct nlattr *opt,
339 			struct netlink_ext_ack *extack)
340 {
341 	struct choke_sched_data *q = qdisc_priv(sch);
342 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
343 	const struct tc_red_qopt *ctl;
344 	int err;
345 	struct sk_buff **old = NULL;
346 	unsigned int mask;
347 	u32 max_P;
348 
349 	if (opt == NULL)
350 		return -EINVAL;
351 
352 	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
353 					  choke_policy, NULL);
354 	if (err < 0)
355 		return err;
356 
357 	if (tb[TCA_CHOKE_PARMS] == NULL ||
358 	    tb[TCA_CHOKE_STAB] == NULL)
359 		return -EINVAL;
360 
361 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
362 
363 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
364 
365 	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
366 		return -EINVAL;
367 
368 	if (ctl->limit > CHOKE_MAX_QUEUE)
369 		return -EINVAL;
370 
371 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
372 	if (mask != q->tab_mask) {
373 		struct sk_buff **ntab;
374 
375 		ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
376 		if (!ntab)
377 			return -ENOMEM;
378 
379 		sch_tree_lock(sch);
380 		old = q->tab;
381 		if (old) {
382 			unsigned int oqlen = sch->q.qlen, tail = 0;
383 			unsigned dropped = 0;
384 
385 			while (q->head != q->tail) {
386 				struct sk_buff *skb = q->tab[q->head];
387 
388 				q->head = (q->head + 1) & q->tab_mask;
389 				if (!skb)
390 					continue;
391 				if (tail < mask) {
392 					ntab[tail++] = skb;
393 					continue;
394 				}
395 				dropped += qdisc_pkt_len(skb);
396 				qdisc_qstats_backlog_dec(sch, skb);
397 				--sch->q.qlen;
398 				rtnl_qdisc_drop(skb, sch);
399 			}
400 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
401 			q->head = 0;
402 			q->tail = tail;
403 		}
404 
405 		q->tab_mask = mask;
406 		q->tab = ntab;
407 	} else
408 		sch_tree_lock(sch);
409 
410 	q->flags = ctl->flags;
411 	q->limit = ctl->limit;
412 
413 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
414 		      ctl->Plog, ctl->Scell_log,
415 		      nla_data(tb[TCA_CHOKE_STAB]),
416 		      max_P);
417 	red_set_vars(&q->vars);
418 
419 	if (q->head == q->tail)
420 		red_end_of_idle_period(&q->vars);
421 
422 	sch_tree_unlock(sch);
423 	choke_free(old);
424 	return 0;
425 }
426 
427 static int choke_init(struct Qdisc *sch, struct nlattr *opt,
428 		      struct netlink_ext_ack *extack)
429 {
430 	return choke_change(sch, opt, extack);
431 }
432 
433 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
434 {
435 	struct choke_sched_data *q = qdisc_priv(sch);
436 	struct nlattr *opts = NULL;
437 	struct tc_red_qopt opt = {
438 		.limit		= q->limit,
439 		.flags		= q->flags,
440 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
441 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
442 		.Wlog		= q->parms.Wlog,
443 		.Plog		= q->parms.Plog,
444 		.Scell_log	= q->parms.Scell_log,
445 	};
446 
447 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
448 	if (opts == NULL)
449 		goto nla_put_failure;
450 
451 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
452 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
453 		goto nla_put_failure;
454 	return nla_nest_end(skb, opts);
455 
456 nla_put_failure:
457 	nla_nest_cancel(skb, opts);
458 	return -EMSGSIZE;
459 }
460 
461 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
462 {
463 	struct choke_sched_data *q = qdisc_priv(sch);
464 	struct tc_choke_xstats st = {
465 		.early	= q->stats.prob_drop + q->stats.forced_drop,
466 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
467 		.pdrop	= q->stats.pdrop,
468 		.other	= q->stats.other,
469 		.matched = q->stats.matched,
470 	};
471 
472 	return gnet_stats_copy_app(d, &st, sizeof(st));
473 }
474 
475 static void choke_destroy(struct Qdisc *sch)
476 {
477 	struct choke_sched_data *q = qdisc_priv(sch);
478 
479 	choke_free(q->tab);
480 }
481 
482 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
483 {
484 	struct choke_sched_data *q = qdisc_priv(sch);
485 
486 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
487 }
488 
489 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
490 	.id		=	"choke",
491 	.priv_size	=	sizeof(struct choke_sched_data),
492 
493 	.enqueue	=	choke_enqueue,
494 	.dequeue	=	choke_dequeue,
495 	.peek		=	choke_peek_head,
496 	.init		=	choke_init,
497 	.destroy	=	choke_destroy,
498 	.reset		=	choke_reset,
499 	.change		=	choke_change,
500 	.dump		=	choke_dump,
501 	.dump_stats	=	choke_dump_stats,
502 	.owner		=	THIS_MODULE,
503 };
504 
505 static int __init choke_module_init(void)
506 {
507 	return register_qdisc(&choke_qdisc_ops);
508 }
509 
510 static void __exit choke_module_exit(void)
511 {
512 	unregister_qdisc(&choke_qdisc_ops);
513 }
514 
515 module_init(choke_module_init)
516 module_exit(choke_module_exit)
517 
518 MODULE_LICENSE("GPL");
519