xref: /openbmc/linux/net/sched/sch_choke.c (revision dbf4d133)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * net/sched/sch_choke.c	CHOKE scheduler
4  *
5  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
6  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
7  */
8 
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/skbuff.h>
13 #include <linux/vmalloc.h>
14 #include <net/pkt_sched.h>
15 #include <net/pkt_cls.h>
16 #include <net/inet_ecn.h>
17 #include <net/red.h>
18 #include <net/flow_dissector.h>
19 
20 /*
21    CHOKe stateless AQM for fair bandwidth allocation
22    =================================================
23 
24    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
25    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
26    maintains no flow state. The difference from RED is an additional step
27    during the enqueuing process. If average queue size is over the
28    low threshold (qmin), a packet is chosen at random from the queue.
29    If both the new and chosen packet are from the same flow, both
30    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
31    needs to access packets in queue randomly. It has a minimal class
32    interface to allow overriding the builtin flow classifier with
33    filters.
34 
35    Source:
36    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
37    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
38    IEEE INFOCOM, 2000.
39 
40    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
41    Characteristics", IEEE/ACM Transactions on Networking, 2004
42 
43  */
44 
45 /* Upper bound on size of sk_buff table (packets) */
46 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
47 
48 struct choke_sched_data {
49 /* Parameters */
50 	u32		 limit;
51 	unsigned char	 flags;
52 
53 	struct red_parms parms;
54 
55 /* Variables */
56 	struct red_vars  vars;
57 	struct {
58 		u32	prob_drop;	/* Early probability drops */
59 		u32	prob_mark;	/* Early probability marks */
60 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
61 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
62 		u32	pdrop;          /* Drops due to queue limits */
63 		u32	other;          /* Drops due to drop() calls */
64 		u32	matched;	/* Drops to flow match */
65 	} stats;
66 
67 	unsigned int	 head;
68 	unsigned int	 tail;
69 
70 	unsigned int	 tab_mask; /* size - 1 */
71 
72 	struct sk_buff **tab;
73 };
74 
75 /* number of elements in queue including holes */
76 static unsigned int choke_len(const struct choke_sched_data *q)
77 {
78 	return (q->tail - q->head) & q->tab_mask;
79 }
80 
81 /* Is ECN parameter configured */
82 static int use_ecn(const struct choke_sched_data *q)
83 {
84 	return q->flags & TC_RED_ECN;
85 }
86 
87 /* Should packets over max just be dropped (versus marked) */
88 static int use_harddrop(const struct choke_sched_data *q)
89 {
90 	return q->flags & TC_RED_HARDDROP;
91 }
92 
93 /* Move head pointer forward to skip over holes */
94 static void choke_zap_head_holes(struct choke_sched_data *q)
95 {
96 	do {
97 		q->head = (q->head + 1) & q->tab_mask;
98 		if (q->head == q->tail)
99 			break;
100 	} while (q->tab[q->head] == NULL);
101 }
102 
103 /* Move tail pointer backwards to reuse holes */
104 static void choke_zap_tail_holes(struct choke_sched_data *q)
105 {
106 	do {
107 		q->tail = (q->tail - 1) & q->tab_mask;
108 		if (q->head == q->tail)
109 			break;
110 	} while (q->tab[q->tail] == NULL);
111 }
112 
113 /* Drop packet from queue array by creating a "hole" */
114 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
115 			      struct sk_buff **to_free)
116 {
117 	struct choke_sched_data *q = qdisc_priv(sch);
118 	struct sk_buff *skb = q->tab[idx];
119 
120 	q->tab[idx] = NULL;
121 
122 	if (idx == q->head)
123 		choke_zap_head_holes(q);
124 	if (idx == q->tail)
125 		choke_zap_tail_holes(q);
126 
127 	qdisc_qstats_backlog_dec(sch, skb);
128 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
129 	qdisc_drop(skb, sch, to_free);
130 	--sch->q.qlen;
131 }
132 
133 struct choke_skb_cb {
134 	u16			classid;
135 	u8			keys_valid;
136 	struct			flow_keys_digest keys;
137 };
138 
139 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
140 {
141 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
142 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
143 }
144 
145 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
146 {
147 	choke_skb_cb(skb)->classid = classid;
148 }
149 
150 /*
151  * Compare flow of two packets
152  *  Returns true only if source and destination address and port match.
153  *          false for special cases
154  */
155 static bool choke_match_flow(struct sk_buff *skb1,
156 			     struct sk_buff *skb2)
157 {
158 	struct flow_keys temp;
159 
160 	if (skb1->protocol != skb2->protocol)
161 		return false;
162 
163 	if (!choke_skb_cb(skb1)->keys_valid) {
164 		choke_skb_cb(skb1)->keys_valid = 1;
165 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
166 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
167 	}
168 
169 	if (!choke_skb_cb(skb2)->keys_valid) {
170 		choke_skb_cb(skb2)->keys_valid = 1;
171 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
172 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
173 	}
174 
175 	return !memcmp(&choke_skb_cb(skb1)->keys,
176 		       &choke_skb_cb(skb2)->keys,
177 		       sizeof(choke_skb_cb(skb1)->keys));
178 }
179 
180 /*
181  * Select a packet at random from queue
182  * HACK: since queue can have holes from previous deletion; retry several
183  *   times to find a random skb but then just give up and return the head
184  * Will return NULL if queue is empty (q->head == q->tail)
185  */
186 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
187 					 unsigned int *pidx)
188 {
189 	struct sk_buff *skb;
190 	int retrys = 3;
191 
192 	do {
193 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
194 		skb = q->tab[*pidx];
195 		if (skb)
196 			return skb;
197 	} while (--retrys > 0);
198 
199 	return q->tab[*pidx = q->head];
200 }
201 
202 /*
203  * Compare new packet with random packet in queue
204  * returns true if matched and sets *pidx
205  */
206 static bool choke_match_random(const struct choke_sched_data *q,
207 			       struct sk_buff *nskb,
208 			       unsigned int *pidx)
209 {
210 	struct sk_buff *oskb;
211 
212 	if (q->head == q->tail)
213 		return false;
214 
215 	oskb = choke_peek_random(q, pidx);
216 	return choke_match_flow(oskb, nskb);
217 }
218 
219 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
220 			 struct sk_buff **to_free)
221 {
222 	struct choke_sched_data *q = qdisc_priv(sch);
223 	const struct red_parms *p = &q->parms;
224 
225 	choke_skb_cb(skb)->keys_valid = 0;
226 	/* Compute average queue usage (see RED) */
227 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
228 	if (red_is_idling(&q->vars))
229 		red_end_of_idle_period(&q->vars);
230 
231 	/* Is queue small? */
232 	if (q->vars.qavg <= p->qth_min)
233 		q->vars.qcount = -1;
234 	else {
235 		unsigned int idx;
236 
237 		/* Draw a packet at random from queue and compare flow */
238 		if (choke_match_random(q, skb, &idx)) {
239 			q->stats.matched++;
240 			choke_drop_by_idx(sch, idx, to_free);
241 			goto congestion_drop;
242 		}
243 
244 		/* Queue is large, always mark/drop */
245 		if (q->vars.qavg > p->qth_max) {
246 			q->vars.qcount = -1;
247 
248 			qdisc_qstats_overlimit(sch);
249 			if (use_harddrop(q) || !use_ecn(q) ||
250 			    !INET_ECN_set_ce(skb)) {
251 				q->stats.forced_drop++;
252 				goto congestion_drop;
253 			}
254 
255 			q->stats.forced_mark++;
256 		} else if (++q->vars.qcount) {
257 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
258 				q->vars.qcount = 0;
259 				q->vars.qR = red_random(p);
260 
261 				qdisc_qstats_overlimit(sch);
262 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
263 					q->stats.prob_drop++;
264 					goto congestion_drop;
265 				}
266 
267 				q->stats.prob_mark++;
268 			}
269 		} else
270 			q->vars.qR = red_random(p);
271 	}
272 
273 	/* Admit new packet */
274 	if (sch->q.qlen < q->limit) {
275 		q->tab[q->tail] = skb;
276 		q->tail = (q->tail + 1) & q->tab_mask;
277 		++sch->q.qlen;
278 		qdisc_qstats_backlog_inc(sch, skb);
279 		return NET_XMIT_SUCCESS;
280 	}
281 
282 	q->stats.pdrop++;
283 	return qdisc_drop(skb, sch, to_free);
284 
285 congestion_drop:
286 	qdisc_drop(skb, sch, to_free);
287 	return NET_XMIT_CN;
288 }
289 
290 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
291 {
292 	struct choke_sched_data *q = qdisc_priv(sch);
293 	struct sk_buff *skb;
294 
295 	if (q->head == q->tail) {
296 		if (!red_is_idling(&q->vars))
297 			red_start_of_idle_period(&q->vars);
298 		return NULL;
299 	}
300 
301 	skb = q->tab[q->head];
302 	q->tab[q->head] = NULL;
303 	choke_zap_head_holes(q);
304 	--sch->q.qlen;
305 	qdisc_qstats_backlog_dec(sch, skb);
306 	qdisc_bstats_update(sch, skb);
307 
308 	return skb;
309 }
310 
311 static void choke_reset(struct Qdisc *sch)
312 {
313 	struct choke_sched_data *q = qdisc_priv(sch);
314 
315 	while (q->head != q->tail) {
316 		struct sk_buff *skb = q->tab[q->head];
317 
318 		q->head = (q->head + 1) & q->tab_mask;
319 		if (!skb)
320 			continue;
321 		rtnl_qdisc_drop(skb, sch);
322 	}
323 
324 	sch->q.qlen = 0;
325 	sch->qstats.backlog = 0;
326 	if (q->tab)
327 		memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
328 	q->head = q->tail = 0;
329 	red_restart(&q->vars);
330 }
331 
332 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
333 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
334 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
335 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
336 };
337 
338 
339 static void choke_free(void *addr)
340 {
341 	kvfree(addr);
342 }
343 
344 static int choke_change(struct Qdisc *sch, struct nlattr *opt,
345 			struct netlink_ext_ack *extack)
346 {
347 	struct choke_sched_data *q = qdisc_priv(sch);
348 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
349 	const struct tc_red_qopt *ctl;
350 	int err;
351 	struct sk_buff **old = NULL;
352 	unsigned int mask;
353 	u32 max_P;
354 
355 	if (opt == NULL)
356 		return -EINVAL;
357 
358 	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
359 					  choke_policy, NULL);
360 	if (err < 0)
361 		return err;
362 
363 	if (tb[TCA_CHOKE_PARMS] == NULL ||
364 	    tb[TCA_CHOKE_STAB] == NULL)
365 		return -EINVAL;
366 
367 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
368 
369 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
370 
371 	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
372 		return -EINVAL;
373 
374 	if (ctl->limit > CHOKE_MAX_QUEUE)
375 		return -EINVAL;
376 
377 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
378 	if (mask != q->tab_mask) {
379 		struct sk_buff **ntab;
380 
381 		ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
382 		if (!ntab)
383 			return -ENOMEM;
384 
385 		sch_tree_lock(sch);
386 		old = q->tab;
387 		if (old) {
388 			unsigned int oqlen = sch->q.qlen, tail = 0;
389 			unsigned dropped = 0;
390 
391 			while (q->head != q->tail) {
392 				struct sk_buff *skb = q->tab[q->head];
393 
394 				q->head = (q->head + 1) & q->tab_mask;
395 				if (!skb)
396 					continue;
397 				if (tail < mask) {
398 					ntab[tail++] = skb;
399 					continue;
400 				}
401 				dropped += qdisc_pkt_len(skb);
402 				qdisc_qstats_backlog_dec(sch, skb);
403 				--sch->q.qlen;
404 				rtnl_qdisc_drop(skb, sch);
405 			}
406 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
407 			q->head = 0;
408 			q->tail = tail;
409 		}
410 
411 		q->tab_mask = mask;
412 		q->tab = ntab;
413 	} else
414 		sch_tree_lock(sch);
415 
416 	q->flags = ctl->flags;
417 	q->limit = ctl->limit;
418 
419 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
420 		      ctl->Plog, ctl->Scell_log,
421 		      nla_data(tb[TCA_CHOKE_STAB]),
422 		      max_P);
423 	red_set_vars(&q->vars);
424 
425 	if (q->head == q->tail)
426 		red_end_of_idle_period(&q->vars);
427 
428 	sch_tree_unlock(sch);
429 	choke_free(old);
430 	return 0;
431 }
432 
433 static int choke_init(struct Qdisc *sch, struct nlattr *opt,
434 		      struct netlink_ext_ack *extack)
435 {
436 	return choke_change(sch, opt, extack);
437 }
438 
439 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
440 {
441 	struct choke_sched_data *q = qdisc_priv(sch);
442 	struct nlattr *opts = NULL;
443 	struct tc_red_qopt opt = {
444 		.limit		= q->limit,
445 		.flags		= q->flags,
446 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
447 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
448 		.Wlog		= q->parms.Wlog,
449 		.Plog		= q->parms.Plog,
450 		.Scell_log	= q->parms.Scell_log,
451 	};
452 
453 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
454 	if (opts == NULL)
455 		goto nla_put_failure;
456 
457 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
458 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
459 		goto nla_put_failure;
460 	return nla_nest_end(skb, opts);
461 
462 nla_put_failure:
463 	nla_nest_cancel(skb, opts);
464 	return -EMSGSIZE;
465 }
466 
467 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
468 {
469 	struct choke_sched_data *q = qdisc_priv(sch);
470 	struct tc_choke_xstats st = {
471 		.early	= q->stats.prob_drop + q->stats.forced_drop,
472 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
473 		.pdrop	= q->stats.pdrop,
474 		.other	= q->stats.other,
475 		.matched = q->stats.matched,
476 	};
477 
478 	return gnet_stats_copy_app(d, &st, sizeof(st));
479 }
480 
481 static void choke_destroy(struct Qdisc *sch)
482 {
483 	struct choke_sched_data *q = qdisc_priv(sch);
484 
485 	choke_free(q->tab);
486 }
487 
488 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
489 {
490 	struct choke_sched_data *q = qdisc_priv(sch);
491 
492 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
493 }
494 
495 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
496 	.id		=	"choke",
497 	.priv_size	=	sizeof(struct choke_sched_data),
498 
499 	.enqueue	=	choke_enqueue,
500 	.dequeue	=	choke_dequeue,
501 	.peek		=	choke_peek_head,
502 	.init		=	choke_init,
503 	.destroy	=	choke_destroy,
504 	.reset		=	choke_reset,
505 	.change		=	choke_change,
506 	.dump		=	choke_dump,
507 	.dump_stats	=	choke_dump_stats,
508 	.owner		=	THIS_MODULE,
509 };
510 
511 static int __init choke_module_init(void)
512 {
513 	return register_qdisc(&choke_qdisc_ops);
514 }
515 
516 static void __exit choke_module_exit(void)
517 {
518 	unregister_qdisc(&choke_qdisc_ops);
519 }
520 
521 module_init(choke_module_init)
522 module_exit(choke_module_exit)
523 
524 MODULE_LICENSE("GPL");
525