xref: /openbmc/linux/net/sched/sch_choke.c (revision 8795a739)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_choke.c	CHOKE scheduler
4  *
5  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
6  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
7  */
8 
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/skbuff.h>
13 #include <linux/vmalloc.h>
14 #include <net/pkt_sched.h>
15 #include <net/pkt_cls.h>
16 #include <net/inet_ecn.h>
17 #include <net/red.h>
18 #include <net/flow_dissector.h>
19 
20 /*
21    CHOKe stateless AQM for fair bandwidth allocation
22    =================================================
23 
24    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
25    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
26    maintains no flow state. The difference from RED is an additional step
27    during the enqueuing process. If average queue size is over the
28    low threshold (qmin), a packet is chosen at random from the queue.
29    If both the new and chosen packet are from the same flow, both
30    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
31    needs to access packets in queue randomly. It has a minimal class
32    interface to allow overriding the builtin flow classifier with
33    filters.
34 
35    Source:
36    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
37    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
38    IEEE INFOCOM, 2000.
39 
40    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
41    Characteristics", IEEE/ACM Transactions on Networking, 2004
42 
43  */
44 
45 /* Upper bound on size of sk_buff table (packets) */
46 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
47 
48 struct choke_sched_data {
49 /* Parameters */
50 	u32		 limit;
51 	unsigned char	 flags;
52 
53 	struct red_parms parms;
54 
55 /* Variables */
56 	struct red_vars  vars;
57 	struct {
58 		u32	prob_drop;	/* Early probability drops */
59 		u32	prob_mark;	/* Early probability marks */
60 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
61 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
62 		u32	pdrop;          /* Drops due to queue limits */
63 		u32	other;          /* Drops due to drop() calls */
64 		u32	matched;	/* Drops to flow match */
65 	} stats;
66 
67 	unsigned int	 head;
68 	unsigned int	 tail;
69 
70 	unsigned int	 tab_mask; /* size - 1 */
71 
72 	struct sk_buff **tab;
73 };
74 
75 /* number of elements in queue including holes */
76 static unsigned int choke_len(const struct choke_sched_data *q)
77 {
78 	return (q->tail - q->head) & q->tab_mask;
79 }
80 
81 /* Is ECN parameter configured */
82 static int use_ecn(const struct choke_sched_data *q)
83 {
84 	return q->flags & TC_RED_ECN;
85 }
86 
87 /* Should packets over max just be dropped (versus marked) */
88 static int use_harddrop(const struct choke_sched_data *q)
89 {
90 	return q->flags & TC_RED_HARDDROP;
91 }
92 
93 /* Move head pointer forward to skip over holes */
94 static void choke_zap_head_holes(struct choke_sched_data *q)
95 {
96 	do {
97 		q->head = (q->head + 1) & q->tab_mask;
98 		if (q->head == q->tail)
99 			break;
100 	} while (q->tab[q->head] == NULL);
101 }
102 
103 /* Move tail pointer backwards to reuse holes */
104 static void choke_zap_tail_holes(struct choke_sched_data *q)
105 {
106 	do {
107 		q->tail = (q->tail - 1) & q->tab_mask;
108 		if (q->head == q->tail)
109 			break;
110 	} while (q->tab[q->tail] == NULL);
111 }
112 
113 /* Drop packet from queue array by creating a "hole" */
114 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
115 			      struct sk_buff **to_free)
116 {
117 	struct choke_sched_data *q = qdisc_priv(sch);
118 	struct sk_buff *skb = q->tab[idx];
119 
120 	q->tab[idx] = NULL;
121 
122 	if (idx == q->head)
123 		choke_zap_head_holes(q);
124 	if (idx == q->tail)
125 		choke_zap_tail_holes(q);
126 
127 	qdisc_qstats_backlog_dec(sch, skb);
128 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
129 	qdisc_drop(skb, sch, to_free);
130 	--sch->q.qlen;
131 }
132 
133 struct choke_skb_cb {
134 	u16			classid;
135 	u8			keys_valid;
136 	struct			flow_keys_digest keys;
137 };
138 
139 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
140 {
141 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
142 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
143 }
144 
145 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
146 {
147 	choke_skb_cb(skb)->classid = classid;
148 }
149 
150 /*
151  * Compare flow of two packets
152  *  Returns true only if source and destination address and port match.
153  *          false for special cases
154  */
155 static bool choke_match_flow(struct sk_buff *skb1,
156 			     struct sk_buff *skb2)
157 {
158 	struct flow_keys temp;
159 
160 	if (skb1->protocol != skb2->protocol)
161 		return false;
162 
163 	if (!choke_skb_cb(skb1)->keys_valid) {
164 		choke_skb_cb(skb1)->keys_valid = 1;
165 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
166 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
167 	}
168 
169 	if (!choke_skb_cb(skb2)->keys_valid) {
170 		choke_skb_cb(skb2)->keys_valid = 1;
171 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
172 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
173 	}
174 
175 	return !memcmp(&choke_skb_cb(skb1)->keys,
176 		       &choke_skb_cb(skb2)->keys,
177 		       sizeof(choke_skb_cb(skb1)->keys));
178 }
179 
180 /*
181  * Select a packet at random from queue
182  * HACK: since queue can have holes from previous deletion; retry several
183  *   times to find a random skb but then just give up and return the head
184  * Will return NULL if queue is empty (q->head == q->tail)
185  */
186 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
187 					 unsigned int *pidx)
188 {
189 	struct sk_buff *skb;
190 	int retrys = 3;
191 
192 	do {
193 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
194 		skb = q->tab[*pidx];
195 		if (skb)
196 			return skb;
197 	} while (--retrys > 0);
198 
199 	return q->tab[*pidx = q->head];
200 }
201 
202 /*
203  * Compare new packet with random packet in queue
204  * returns true if matched and sets *pidx
205  */
206 static bool choke_match_random(const struct choke_sched_data *q,
207 			       struct sk_buff *nskb,
208 			       unsigned int *pidx)
209 {
210 	struct sk_buff *oskb;
211 
212 	if (q->head == q->tail)
213 		return false;
214 
215 	oskb = choke_peek_random(q, pidx);
216 	return choke_match_flow(oskb, nskb);
217 }
218 
219 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
220 			 struct sk_buff **to_free)
221 {
222 	struct choke_sched_data *q = qdisc_priv(sch);
223 	const struct red_parms *p = &q->parms;
224 
225 	choke_skb_cb(skb)->keys_valid = 0;
226 	/* Compute average queue usage (see RED) */
227 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
228 	if (red_is_idling(&q->vars))
229 		red_end_of_idle_period(&q->vars);
230 
231 	/* Is queue small? */
232 	if (q->vars.qavg <= p->qth_min)
233 		q->vars.qcount = -1;
234 	else {
235 		unsigned int idx;
236 
237 		/* Draw a packet at random from queue and compare flow */
238 		if (choke_match_random(q, skb, &idx)) {
239 			q->stats.matched++;
240 			choke_drop_by_idx(sch, idx, to_free);
241 			goto congestion_drop;
242 		}
243 
244 		/* Queue is large, always mark/drop */
245 		if (q->vars.qavg > p->qth_max) {
246 			q->vars.qcount = -1;
247 
248 			qdisc_qstats_overlimit(sch);
249 			if (use_harddrop(q) || !use_ecn(q) ||
250 			    !INET_ECN_set_ce(skb)) {
251 				q->stats.forced_drop++;
252 				goto congestion_drop;
253 			}
254 
255 			q->stats.forced_mark++;
256 		} else if (++q->vars.qcount) {
257 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
258 				q->vars.qcount = 0;
259 				q->vars.qR = red_random(p);
260 
261 				qdisc_qstats_overlimit(sch);
262 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
263 					q->stats.prob_drop++;
264 					goto congestion_drop;
265 				}
266 
267 				q->stats.prob_mark++;
268 			}
269 		} else
270 			q->vars.qR = red_random(p);
271 	}
272 
273 	/* Admit new packet */
274 	if (sch->q.qlen < q->limit) {
275 		q->tab[q->tail] = skb;
276 		q->tail = (q->tail + 1) & q->tab_mask;
277 		++sch->q.qlen;
278 		qdisc_qstats_backlog_inc(sch, skb);
279 		return NET_XMIT_SUCCESS;
280 	}
281 
282 	q->stats.pdrop++;
283 	return qdisc_drop(skb, sch, to_free);
284 
285 congestion_drop:
286 	qdisc_drop(skb, sch, to_free);
287 	return NET_XMIT_CN;
288 }
289 
290 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
291 {
292 	struct choke_sched_data *q = qdisc_priv(sch);
293 	struct sk_buff *skb;
294 
295 	if (q->head == q->tail) {
296 		if (!red_is_idling(&q->vars))
297 			red_start_of_idle_period(&q->vars);
298 		return NULL;
299 	}
300 
301 	skb = q->tab[q->head];
302 	q->tab[q->head] = NULL;
303 	choke_zap_head_holes(q);
304 	--sch->q.qlen;
305 	qdisc_qstats_backlog_dec(sch, skb);
306 	qdisc_bstats_update(sch, skb);
307 
308 	return skb;
309 }
310 
311 static void choke_reset(struct Qdisc *sch)
312 {
313 	struct choke_sched_data *q = qdisc_priv(sch);
314 
315 	while (q->head != q->tail) {
316 		struct sk_buff *skb = q->tab[q->head];
317 
318 		q->head = (q->head + 1) & q->tab_mask;
319 		if (!skb)
320 			continue;
321 		rtnl_qdisc_drop(skb, sch);
322 	}
323 
324 	sch->q.qlen = 0;
325 	sch->qstats.backlog = 0;
326 	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
327 	q->head = q->tail = 0;
328 	red_restart(&q->vars);
329 }
330 
331 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
332 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
333 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
334 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
335 };
336 
337 
338 static void choke_free(void *addr)
339 {
340 	kvfree(addr);
341 }
342 
343 static int choke_change(struct Qdisc *sch, struct nlattr *opt,
344 			struct netlink_ext_ack *extack)
345 {
346 	struct choke_sched_data *q = qdisc_priv(sch);
347 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
348 	const struct tc_red_qopt *ctl;
349 	int err;
350 	struct sk_buff **old = NULL;
351 	unsigned int mask;
352 	u32 max_P;
353 
354 	if (opt == NULL)
355 		return -EINVAL;
356 
357 	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
358 					  choke_policy, NULL);
359 	if (err < 0)
360 		return err;
361 
362 	if (tb[TCA_CHOKE_PARMS] == NULL ||
363 	    tb[TCA_CHOKE_STAB] == NULL)
364 		return -EINVAL;
365 
366 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
367 
368 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
369 
370 	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
371 		return -EINVAL;
372 
373 	if (ctl->limit > CHOKE_MAX_QUEUE)
374 		return -EINVAL;
375 
376 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
377 	if (mask != q->tab_mask) {
378 		struct sk_buff **ntab;
379 
380 		ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
381 		if (!ntab)
382 			return -ENOMEM;
383 
384 		sch_tree_lock(sch);
385 		old = q->tab;
386 		if (old) {
387 			unsigned int oqlen = sch->q.qlen, tail = 0;
388 			unsigned dropped = 0;
389 
390 			while (q->head != q->tail) {
391 				struct sk_buff *skb = q->tab[q->head];
392 
393 				q->head = (q->head + 1) & q->tab_mask;
394 				if (!skb)
395 					continue;
396 				if (tail < mask) {
397 					ntab[tail++] = skb;
398 					continue;
399 				}
400 				dropped += qdisc_pkt_len(skb);
401 				qdisc_qstats_backlog_dec(sch, skb);
402 				--sch->q.qlen;
403 				rtnl_qdisc_drop(skb, sch);
404 			}
405 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
406 			q->head = 0;
407 			q->tail = tail;
408 		}
409 
410 		q->tab_mask = mask;
411 		q->tab = ntab;
412 	} else
413 		sch_tree_lock(sch);
414 
415 	q->flags = ctl->flags;
416 	q->limit = ctl->limit;
417 
418 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
419 		      ctl->Plog, ctl->Scell_log,
420 		      nla_data(tb[TCA_CHOKE_STAB]),
421 		      max_P);
422 	red_set_vars(&q->vars);
423 
424 	if (q->head == q->tail)
425 		red_end_of_idle_period(&q->vars);
426 
427 	sch_tree_unlock(sch);
428 	choke_free(old);
429 	return 0;
430 }
431 
432 static int choke_init(struct Qdisc *sch, struct nlattr *opt,
433 		      struct netlink_ext_ack *extack)
434 {
435 	return choke_change(sch, opt, extack);
436 }
437 
438 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
439 {
440 	struct choke_sched_data *q = qdisc_priv(sch);
441 	struct nlattr *opts = NULL;
442 	struct tc_red_qopt opt = {
443 		.limit		= q->limit,
444 		.flags		= q->flags,
445 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
446 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
447 		.Wlog		= q->parms.Wlog,
448 		.Plog		= q->parms.Plog,
449 		.Scell_log	= q->parms.Scell_log,
450 	};
451 
452 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
453 	if (opts == NULL)
454 		goto nla_put_failure;
455 
456 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
457 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
458 		goto nla_put_failure;
459 	return nla_nest_end(skb, opts);
460 
461 nla_put_failure:
462 	nla_nest_cancel(skb, opts);
463 	return -EMSGSIZE;
464 }
465 
466 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
467 {
468 	struct choke_sched_data *q = qdisc_priv(sch);
469 	struct tc_choke_xstats st = {
470 		.early	= q->stats.prob_drop + q->stats.forced_drop,
471 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
472 		.pdrop	= q->stats.pdrop,
473 		.other	= q->stats.other,
474 		.matched = q->stats.matched,
475 	};
476 
477 	return gnet_stats_copy_app(d, &st, sizeof(st));
478 }
479 
480 static void choke_destroy(struct Qdisc *sch)
481 {
482 	struct choke_sched_data *q = qdisc_priv(sch);
483 
484 	choke_free(q->tab);
485 }
486 
487 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
488 {
489 	struct choke_sched_data *q = qdisc_priv(sch);
490 
491 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
492 }
493 
494 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
495 	.id		=	"choke",
496 	.priv_size	=	sizeof(struct choke_sched_data),
497 
498 	.enqueue	=	choke_enqueue,
499 	.dequeue	=	choke_dequeue,
500 	.peek		=	choke_peek_head,
501 	.init		=	choke_init,
502 	.destroy	=	choke_destroy,
503 	.reset		=	choke_reset,
504 	.change		=	choke_change,
505 	.dump		=	choke_dump,
506 	.dump_stats	=	choke_dump_stats,
507 	.owner		=	THIS_MODULE,
508 };
509 
510 static int __init choke_module_init(void)
511 {
512 	return register_qdisc(&choke_qdisc_ops);
513 }
514 
515 static void __exit choke_module_exit(void)
516 {
517 	unregister_qdisc(&choke_qdisc_ops);
518 }
519 
520 module_init(choke_module_init)
521 module_exit(choke_module_exit)
522 
523 MODULE_LICENSE("GPL");
524