xref: /openbmc/linux/net/sched/sch_choke.c (revision 4f727ecefefbd180de10e25b3e74c03dce3f1e75)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/vmalloc.h>
18 #include <net/pkt_sched.h>
19 #include <net/pkt_cls.h>
20 #include <net/inet_ecn.h>
21 #include <net/red.h>
22 #include <net/flow_dissector.h>
23 
24 /*
25    CHOKe stateless AQM for fair bandwidth allocation
26    =================================================
27 
28    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
29    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
30    maintains no flow state. The difference from RED is an additional step
31    during the enqueuing process. If average queue size is over the
32    low threshold (qmin), a packet is chosen at random from the queue.
33    If both the new and chosen packet are from the same flow, both
34    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
35    needs to access packets in queue randomly. It has a minimal class
36    interface to allow overriding the builtin flow classifier with
37    filters.
38 
39    Source:
40    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
41    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
42    IEEE INFOCOM, 2000.
43 
44    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
45    Characteristics", IEEE/ACM Transactions on Networking, 2004
46 
47  */
48 
49 /* Upper bound on size of sk_buff table (packets) */
50 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
51 
52 struct choke_sched_data {
53 /* Parameters */
54 	u32		 limit;
55 	unsigned char	 flags;
56 
57 	struct red_parms parms;
58 
59 /* Variables */
60 	struct red_vars  vars;
61 	struct {
62 		u32	prob_drop;	/* Early probability drops */
63 		u32	prob_mark;	/* Early probability marks */
64 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
65 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
66 		u32	pdrop;          /* Drops due to queue limits */
67 		u32	other;          /* Drops due to drop() calls */
68 		u32	matched;	/* Drops to flow match */
69 	} stats;
70 
71 	unsigned int	 head;
72 	unsigned int	 tail;
73 
74 	unsigned int	 tab_mask; /* size - 1 */
75 
76 	struct sk_buff **tab;
77 };
78 
79 /* number of elements in queue including holes */
80 static unsigned int choke_len(const struct choke_sched_data *q)
81 {
82 	return (q->tail - q->head) & q->tab_mask;
83 }
84 
85 /* Is ECN parameter configured */
86 static int use_ecn(const struct choke_sched_data *q)
87 {
88 	return q->flags & TC_RED_ECN;
89 }
90 
91 /* Should packets over max just be dropped (versus marked) */
92 static int use_harddrop(const struct choke_sched_data *q)
93 {
94 	return q->flags & TC_RED_HARDDROP;
95 }
96 
97 /* Move head pointer forward to skip over holes */
98 static void choke_zap_head_holes(struct choke_sched_data *q)
99 {
100 	do {
101 		q->head = (q->head + 1) & q->tab_mask;
102 		if (q->head == q->tail)
103 			break;
104 	} while (q->tab[q->head] == NULL);
105 }
106 
107 /* Move tail pointer backwards to reuse holes */
108 static void choke_zap_tail_holes(struct choke_sched_data *q)
109 {
110 	do {
111 		q->tail = (q->tail - 1) & q->tab_mask;
112 		if (q->head == q->tail)
113 			break;
114 	} while (q->tab[q->tail] == NULL);
115 }
116 
117 /* Drop packet from queue array by creating a "hole" */
118 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
119 			      struct sk_buff **to_free)
120 {
121 	struct choke_sched_data *q = qdisc_priv(sch);
122 	struct sk_buff *skb = q->tab[idx];
123 
124 	q->tab[idx] = NULL;
125 
126 	if (idx == q->head)
127 		choke_zap_head_holes(q);
128 	if (idx == q->tail)
129 		choke_zap_tail_holes(q);
130 
131 	qdisc_qstats_backlog_dec(sch, skb);
132 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
133 	qdisc_drop(skb, sch, to_free);
134 	--sch->q.qlen;
135 }
136 
137 struct choke_skb_cb {
138 	u16			classid;
139 	u8			keys_valid;
140 	struct			flow_keys_digest keys;
141 };
142 
143 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
144 {
145 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
146 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
147 }
148 
149 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
150 {
151 	choke_skb_cb(skb)->classid = classid;
152 }
153 
154 /*
155  * Compare flow of two packets
156  *  Returns true only if source and destination address and port match.
157  *          false for special cases
158  */
159 static bool choke_match_flow(struct sk_buff *skb1,
160 			     struct sk_buff *skb2)
161 {
162 	struct flow_keys temp;
163 
164 	if (skb1->protocol != skb2->protocol)
165 		return false;
166 
167 	if (!choke_skb_cb(skb1)->keys_valid) {
168 		choke_skb_cb(skb1)->keys_valid = 1;
169 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
170 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
171 	}
172 
173 	if (!choke_skb_cb(skb2)->keys_valid) {
174 		choke_skb_cb(skb2)->keys_valid = 1;
175 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
176 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
177 	}
178 
179 	return !memcmp(&choke_skb_cb(skb1)->keys,
180 		       &choke_skb_cb(skb2)->keys,
181 		       sizeof(choke_skb_cb(skb1)->keys));
182 }
183 
184 /*
185  * Select a packet at random from queue
186  * HACK: since queue can have holes from previous deletion; retry several
187  *   times to find a random skb but then just give up and return the head
188  * Will return NULL if queue is empty (q->head == q->tail)
189  */
190 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
191 					 unsigned int *pidx)
192 {
193 	struct sk_buff *skb;
194 	int retrys = 3;
195 
196 	do {
197 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
198 		skb = q->tab[*pidx];
199 		if (skb)
200 			return skb;
201 	} while (--retrys > 0);
202 
203 	return q->tab[*pidx = q->head];
204 }
205 
206 /*
207  * Compare new packet with random packet in queue
208  * returns true if matched and sets *pidx
209  */
210 static bool choke_match_random(const struct choke_sched_data *q,
211 			       struct sk_buff *nskb,
212 			       unsigned int *pidx)
213 {
214 	struct sk_buff *oskb;
215 
216 	if (q->head == q->tail)
217 		return false;
218 
219 	oskb = choke_peek_random(q, pidx);
220 	return choke_match_flow(oskb, nskb);
221 }
222 
223 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
224 			 struct sk_buff **to_free)
225 {
226 	struct choke_sched_data *q = qdisc_priv(sch);
227 	const struct red_parms *p = &q->parms;
228 
229 	choke_skb_cb(skb)->keys_valid = 0;
230 	/* Compute average queue usage (see RED) */
231 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
232 	if (red_is_idling(&q->vars))
233 		red_end_of_idle_period(&q->vars);
234 
235 	/* Is queue small? */
236 	if (q->vars.qavg <= p->qth_min)
237 		q->vars.qcount = -1;
238 	else {
239 		unsigned int idx;
240 
241 		/* Draw a packet at random from queue and compare flow */
242 		if (choke_match_random(q, skb, &idx)) {
243 			q->stats.matched++;
244 			choke_drop_by_idx(sch, idx, to_free);
245 			goto congestion_drop;
246 		}
247 
248 		/* Queue is large, always mark/drop */
249 		if (q->vars.qavg > p->qth_max) {
250 			q->vars.qcount = -1;
251 
252 			qdisc_qstats_overlimit(sch);
253 			if (use_harddrop(q) || !use_ecn(q) ||
254 			    !INET_ECN_set_ce(skb)) {
255 				q->stats.forced_drop++;
256 				goto congestion_drop;
257 			}
258 
259 			q->stats.forced_mark++;
260 		} else if (++q->vars.qcount) {
261 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
262 				q->vars.qcount = 0;
263 				q->vars.qR = red_random(p);
264 
265 				qdisc_qstats_overlimit(sch);
266 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
267 					q->stats.prob_drop++;
268 					goto congestion_drop;
269 				}
270 
271 				q->stats.prob_mark++;
272 			}
273 		} else
274 			q->vars.qR = red_random(p);
275 	}
276 
277 	/* Admit new packet */
278 	if (sch->q.qlen < q->limit) {
279 		q->tab[q->tail] = skb;
280 		q->tail = (q->tail + 1) & q->tab_mask;
281 		++sch->q.qlen;
282 		qdisc_qstats_backlog_inc(sch, skb);
283 		return NET_XMIT_SUCCESS;
284 	}
285 
286 	q->stats.pdrop++;
287 	return qdisc_drop(skb, sch, to_free);
288 
289 congestion_drop:
290 	qdisc_drop(skb, sch, to_free);
291 	return NET_XMIT_CN;
292 }
293 
294 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
295 {
296 	struct choke_sched_data *q = qdisc_priv(sch);
297 	struct sk_buff *skb;
298 
299 	if (q->head == q->tail) {
300 		if (!red_is_idling(&q->vars))
301 			red_start_of_idle_period(&q->vars);
302 		return NULL;
303 	}
304 
305 	skb = q->tab[q->head];
306 	q->tab[q->head] = NULL;
307 	choke_zap_head_holes(q);
308 	--sch->q.qlen;
309 	qdisc_qstats_backlog_dec(sch, skb);
310 	qdisc_bstats_update(sch, skb);
311 
312 	return skb;
313 }
314 
315 static void choke_reset(struct Qdisc *sch)
316 {
317 	struct choke_sched_data *q = qdisc_priv(sch);
318 
319 	while (q->head != q->tail) {
320 		struct sk_buff *skb = q->tab[q->head];
321 
322 		q->head = (q->head + 1) & q->tab_mask;
323 		if (!skb)
324 			continue;
325 		rtnl_qdisc_drop(skb, sch);
326 	}
327 
328 	sch->q.qlen = 0;
329 	sch->qstats.backlog = 0;
330 	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
331 	q->head = q->tail = 0;
332 	red_restart(&q->vars);
333 }
334 
335 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
336 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
337 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
338 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
339 };
340 
341 
342 static void choke_free(void *addr)
343 {
344 	kvfree(addr);
345 }
346 
347 static int choke_change(struct Qdisc *sch, struct nlattr *opt,
348 			struct netlink_ext_ack *extack)
349 {
350 	struct choke_sched_data *q = qdisc_priv(sch);
351 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
352 	const struct tc_red_qopt *ctl;
353 	int err;
354 	struct sk_buff **old = NULL;
355 	unsigned int mask;
356 	u32 max_P;
357 
358 	if (opt == NULL)
359 		return -EINVAL;
360 
361 	err = nla_parse_nested_deprecated(tb, TCA_CHOKE_MAX, opt,
362 					  choke_policy, NULL);
363 	if (err < 0)
364 		return err;
365 
366 	if (tb[TCA_CHOKE_PARMS] == NULL ||
367 	    tb[TCA_CHOKE_STAB] == NULL)
368 		return -EINVAL;
369 
370 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
371 
372 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
373 
374 	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
375 		return -EINVAL;
376 
377 	if (ctl->limit > CHOKE_MAX_QUEUE)
378 		return -EINVAL;
379 
380 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
381 	if (mask != q->tab_mask) {
382 		struct sk_buff **ntab;
383 
384 		ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
385 		if (!ntab)
386 			return -ENOMEM;
387 
388 		sch_tree_lock(sch);
389 		old = q->tab;
390 		if (old) {
391 			unsigned int oqlen = sch->q.qlen, tail = 0;
392 			unsigned dropped = 0;
393 
394 			while (q->head != q->tail) {
395 				struct sk_buff *skb = q->tab[q->head];
396 
397 				q->head = (q->head + 1) & q->tab_mask;
398 				if (!skb)
399 					continue;
400 				if (tail < mask) {
401 					ntab[tail++] = skb;
402 					continue;
403 				}
404 				dropped += qdisc_pkt_len(skb);
405 				qdisc_qstats_backlog_dec(sch, skb);
406 				--sch->q.qlen;
407 				rtnl_qdisc_drop(skb, sch);
408 			}
409 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
410 			q->head = 0;
411 			q->tail = tail;
412 		}
413 
414 		q->tab_mask = mask;
415 		q->tab = ntab;
416 	} else
417 		sch_tree_lock(sch);
418 
419 	q->flags = ctl->flags;
420 	q->limit = ctl->limit;
421 
422 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
423 		      ctl->Plog, ctl->Scell_log,
424 		      nla_data(tb[TCA_CHOKE_STAB]),
425 		      max_P);
426 	red_set_vars(&q->vars);
427 
428 	if (q->head == q->tail)
429 		red_end_of_idle_period(&q->vars);
430 
431 	sch_tree_unlock(sch);
432 	choke_free(old);
433 	return 0;
434 }
435 
436 static int choke_init(struct Qdisc *sch, struct nlattr *opt,
437 		      struct netlink_ext_ack *extack)
438 {
439 	return choke_change(sch, opt, extack);
440 }
441 
442 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
443 {
444 	struct choke_sched_data *q = qdisc_priv(sch);
445 	struct nlattr *opts = NULL;
446 	struct tc_red_qopt opt = {
447 		.limit		= q->limit,
448 		.flags		= q->flags,
449 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
450 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
451 		.Wlog		= q->parms.Wlog,
452 		.Plog		= q->parms.Plog,
453 		.Scell_log	= q->parms.Scell_log,
454 	};
455 
456 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
457 	if (opts == NULL)
458 		goto nla_put_failure;
459 
460 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
461 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
462 		goto nla_put_failure;
463 	return nla_nest_end(skb, opts);
464 
465 nla_put_failure:
466 	nla_nest_cancel(skb, opts);
467 	return -EMSGSIZE;
468 }
469 
470 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
471 {
472 	struct choke_sched_data *q = qdisc_priv(sch);
473 	struct tc_choke_xstats st = {
474 		.early	= q->stats.prob_drop + q->stats.forced_drop,
475 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
476 		.pdrop	= q->stats.pdrop,
477 		.other	= q->stats.other,
478 		.matched = q->stats.matched,
479 	};
480 
481 	return gnet_stats_copy_app(d, &st, sizeof(st));
482 }
483 
484 static void choke_destroy(struct Qdisc *sch)
485 {
486 	struct choke_sched_data *q = qdisc_priv(sch);
487 
488 	choke_free(q->tab);
489 }
490 
491 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
492 {
493 	struct choke_sched_data *q = qdisc_priv(sch);
494 
495 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
496 }
497 
498 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
499 	.id		=	"choke",
500 	.priv_size	=	sizeof(struct choke_sched_data),
501 
502 	.enqueue	=	choke_enqueue,
503 	.dequeue	=	choke_dequeue,
504 	.peek		=	choke_peek_head,
505 	.init		=	choke_init,
506 	.destroy	=	choke_destroy,
507 	.reset		=	choke_reset,
508 	.change		=	choke_change,
509 	.dump		=	choke_dump,
510 	.dump_stats	=	choke_dump_stats,
511 	.owner		=	THIS_MODULE,
512 };
513 
514 static int __init choke_module_init(void)
515 {
516 	return register_qdisc(&choke_qdisc_ops);
517 }
518 
519 static void __exit choke_module_exit(void)
520 {
521 	unregister_qdisc(&choke_qdisc_ops);
522 }
523 
524 module_init(choke_module_init)
525 module_exit(choke_module_exit)
526 
527 MODULE_LICENSE("GPL");
528