xref: /openbmc/linux/net/sched/sch_choke.c (revision 82e6fdd6)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/vmalloc.h>
18 #include <net/pkt_sched.h>
19 #include <net/pkt_cls.h>
20 #include <net/inet_ecn.h>
21 #include <net/red.h>
22 #include <net/flow_dissector.h>
23 
24 /*
25    CHOKe stateless AQM for fair bandwidth allocation
26    =================================================
27 
28    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
29    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
30    maintains no flow state. The difference from RED is an additional step
31    during the enqueuing process. If average queue size is over the
32    low threshold (qmin), a packet is chosen at random from the queue.
33    If both the new and chosen packet are from the same flow, both
34    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
35    needs to access packets in queue randomly. It has a minimal class
36    interface to allow overriding the builtin flow classifier with
37    filters.
38 
39    Source:
40    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
41    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
42    IEEE INFOCOM, 2000.
43 
44    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
45    Characteristics", IEEE/ACM Transactions on Networking, 2004
46 
47  */
48 
49 /* Upper bound on size of sk_buff table (packets) */
50 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
51 
52 struct choke_sched_data {
53 /* Parameters */
54 	u32		 limit;
55 	unsigned char	 flags;
56 
57 	struct red_parms parms;
58 
59 /* Variables */
60 	struct red_vars  vars;
61 	struct {
62 		u32	prob_drop;	/* Early probability drops */
63 		u32	prob_mark;	/* Early probability marks */
64 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
65 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
66 		u32	pdrop;          /* Drops due to queue limits */
67 		u32	other;          /* Drops due to drop() calls */
68 		u32	matched;	/* Drops to flow match */
69 	} stats;
70 
71 	unsigned int	 head;
72 	unsigned int	 tail;
73 
74 	unsigned int	 tab_mask; /* size - 1 */
75 
76 	struct sk_buff **tab;
77 };
78 
79 /* number of elements in queue including holes */
80 static unsigned int choke_len(const struct choke_sched_data *q)
81 {
82 	return (q->tail - q->head) & q->tab_mask;
83 }
84 
85 /* Is ECN parameter configured */
86 static int use_ecn(const struct choke_sched_data *q)
87 {
88 	return q->flags & TC_RED_ECN;
89 }
90 
91 /* Should packets over max just be dropped (versus marked) */
92 static int use_harddrop(const struct choke_sched_data *q)
93 {
94 	return q->flags & TC_RED_HARDDROP;
95 }
96 
97 /* Move head pointer forward to skip over holes */
98 static void choke_zap_head_holes(struct choke_sched_data *q)
99 {
100 	do {
101 		q->head = (q->head + 1) & q->tab_mask;
102 		if (q->head == q->tail)
103 			break;
104 	} while (q->tab[q->head] == NULL);
105 }
106 
107 /* Move tail pointer backwards to reuse holes */
108 static void choke_zap_tail_holes(struct choke_sched_data *q)
109 {
110 	do {
111 		q->tail = (q->tail - 1) & q->tab_mask;
112 		if (q->head == q->tail)
113 			break;
114 	} while (q->tab[q->tail] == NULL);
115 }
116 
117 /* Drop packet from queue array by creating a "hole" */
118 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
119 			      struct sk_buff **to_free)
120 {
121 	struct choke_sched_data *q = qdisc_priv(sch);
122 	struct sk_buff *skb = q->tab[idx];
123 
124 	q->tab[idx] = NULL;
125 
126 	if (idx == q->head)
127 		choke_zap_head_holes(q);
128 	if (idx == q->tail)
129 		choke_zap_tail_holes(q);
130 
131 	qdisc_qstats_backlog_dec(sch, skb);
132 	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
133 	qdisc_drop(skb, sch, to_free);
134 	--sch->q.qlen;
135 }
136 
137 struct choke_skb_cb {
138 	u16			classid;
139 	u8			keys_valid;
140 	struct			flow_keys_digest keys;
141 };
142 
143 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
144 {
145 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
146 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
147 }
148 
149 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
150 {
151 	choke_skb_cb(skb)->classid = classid;
152 }
153 
154 /*
155  * Compare flow of two packets
156  *  Returns true only if source and destination address and port match.
157  *          false for special cases
158  */
159 static bool choke_match_flow(struct sk_buff *skb1,
160 			     struct sk_buff *skb2)
161 {
162 	struct flow_keys temp;
163 
164 	if (skb1->protocol != skb2->protocol)
165 		return false;
166 
167 	if (!choke_skb_cb(skb1)->keys_valid) {
168 		choke_skb_cb(skb1)->keys_valid = 1;
169 		skb_flow_dissect_flow_keys(skb1, &temp, 0);
170 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
171 	}
172 
173 	if (!choke_skb_cb(skb2)->keys_valid) {
174 		choke_skb_cb(skb2)->keys_valid = 1;
175 		skb_flow_dissect_flow_keys(skb2, &temp, 0);
176 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
177 	}
178 
179 	return !memcmp(&choke_skb_cb(skb1)->keys,
180 		       &choke_skb_cb(skb2)->keys,
181 		       sizeof(choke_skb_cb(skb1)->keys));
182 }
183 
184 /*
185  * Select a packet at random from queue
186  * HACK: since queue can have holes from previous deletion; retry several
187  *   times to find a random skb but then just give up and return the head
188  * Will return NULL if queue is empty (q->head == q->tail)
189  */
190 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
191 					 unsigned int *pidx)
192 {
193 	struct sk_buff *skb;
194 	int retrys = 3;
195 
196 	do {
197 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
198 		skb = q->tab[*pidx];
199 		if (skb)
200 			return skb;
201 	} while (--retrys > 0);
202 
203 	return q->tab[*pidx = q->head];
204 }
205 
206 /*
207  * Compare new packet with random packet in queue
208  * returns true if matched and sets *pidx
209  */
210 static bool choke_match_random(const struct choke_sched_data *q,
211 			       struct sk_buff *nskb,
212 			       unsigned int *pidx)
213 {
214 	struct sk_buff *oskb;
215 
216 	if (q->head == q->tail)
217 		return false;
218 
219 	oskb = choke_peek_random(q, pidx);
220 	return choke_match_flow(oskb, nskb);
221 }
222 
223 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
224 			 struct sk_buff **to_free)
225 {
226 	struct choke_sched_data *q = qdisc_priv(sch);
227 	const struct red_parms *p = &q->parms;
228 
229 	choke_skb_cb(skb)->keys_valid = 0;
230 	/* Compute average queue usage (see RED) */
231 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
232 	if (red_is_idling(&q->vars))
233 		red_end_of_idle_period(&q->vars);
234 
235 	/* Is queue small? */
236 	if (q->vars.qavg <= p->qth_min)
237 		q->vars.qcount = -1;
238 	else {
239 		unsigned int idx;
240 
241 		/* Draw a packet at random from queue and compare flow */
242 		if (choke_match_random(q, skb, &idx)) {
243 			q->stats.matched++;
244 			choke_drop_by_idx(sch, idx, to_free);
245 			goto congestion_drop;
246 		}
247 
248 		/* Queue is large, always mark/drop */
249 		if (q->vars.qavg > p->qth_max) {
250 			q->vars.qcount = -1;
251 
252 			qdisc_qstats_overlimit(sch);
253 			if (use_harddrop(q) || !use_ecn(q) ||
254 			    !INET_ECN_set_ce(skb)) {
255 				q->stats.forced_drop++;
256 				goto congestion_drop;
257 			}
258 
259 			q->stats.forced_mark++;
260 		} else if (++q->vars.qcount) {
261 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
262 				q->vars.qcount = 0;
263 				q->vars.qR = red_random(p);
264 
265 				qdisc_qstats_overlimit(sch);
266 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
267 					q->stats.prob_drop++;
268 					goto congestion_drop;
269 				}
270 
271 				q->stats.prob_mark++;
272 			}
273 		} else
274 			q->vars.qR = red_random(p);
275 	}
276 
277 	/* Admit new packet */
278 	if (sch->q.qlen < q->limit) {
279 		q->tab[q->tail] = skb;
280 		q->tail = (q->tail + 1) & q->tab_mask;
281 		++sch->q.qlen;
282 		qdisc_qstats_backlog_inc(sch, skb);
283 		return NET_XMIT_SUCCESS;
284 	}
285 
286 	q->stats.pdrop++;
287 	return qdisc_drop(skb, sch, to_free);
288 
289 congestion_drop:
290 	qdisc_drop(skb, sch, to_free);
291 	return NET_XMIT_CN;
292 }
293 
294 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
295 {
296 	struct choke_sched_data *q = qdisc_priv(sch);
297 	struct sk_buff *skb;
298 
299 	if (q->head == q->tail) {
300 		if (!red_is_idling(&q->vars))
301 			red_start_of_idle_period(&q->vars);
302 		return NULL;
303 	}
304 
305 	skb = q->tab[q->head];
306 	q->tab[q->head] = NULL;
307 	choke_zap_head_holes(q);
308 	--sch->q.qlen;
309 	qdisc_qstats_backlog_dec(sch, skb);
310 	qdisc_bstats_update(sch, skb);
311 
312 	return skb;
313 }
314 
315 static void choke_reset(struct Qdisc *sch)
316 {
317 	struct choke_sched_data *q = qdisc_priv(sch);
318 
319 	while (q->head != q->tail) {
320 		struct sk_buff *skb = q->tab[q->head];
321 
322 		q->head = (q->head + 1) & q->tab_mask;
323 		if (!skb)
324 			continue;
325 		rtnl_qdisc_drop(skb, sch);
326 	}
327 
328 	sch->q.qlen = 0;
329 	sch->qstats.backlog = 0;
330 	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
331 	q->head = q->tail = 0;
332 	red_restart(&q->vars);
333 }
334 
335 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
336 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
337 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
338 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
339 };
340 
341 
342 static void choke_free(void *addr)
343 {
344 	kvfree(addr);
345 }
346 
347 static int choke_change(struct Qdisc *sch, struct nlattr *opt,
348 			struct netlink_ext_ack *extack)
349 {
350 	struct choke_sched_data *q = qdisc_priv(sch);
351 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
352 	const struct tc_red_qopt *ctl;
353 	int err;
354 	struct sk_buff **old = NULL;
355 	unsigned int mask;
356 	u32 max_P;
357 
358 	if (opt == NULL)
359 		return -EINVAL;
360 
361 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL);
362 	if (err < 0)
363 		return err;
364 
365 	if (tb[TCA_CHOKE_PARMS] == NULL ||
366 	    tb[TCA_CHOKE_STAB] == NULL)
367 		return -EINVAL;
368 
369 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
370 
371 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
372 
373 	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
374 		return -EINVAL;
375 
376 	if (ctl->limit > CHOKE_MAX_QUEUE)
377 		return -EINVAL;
378 
379 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
380 	if (mask != q->tab_mask) {
381 		struct sk_buff **ntab;
382 
383 		ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
384 		if (!ntab)
385 			return -ENOMEM;
386 
387 		sch_tree_lock(sch);
388 		old = q->tab;
389 		if (old) {
390 			unsigned int oqlen = sch->q.qlen, tail = 0;
391 			unsigned dropped = 0;
392 
393 			while (q->head != q->tail) {
394 				struct sk_buff *skb = q->tab[q->head];
395 
396 				q->head = (q->head + 1) & q->tab_mask;
397 				if (!skb)
398 					continue;
399 				if (tail < mask) {
400 					ntab[tail++] = skb;
401 					continue;
402 				}
403 				dropped += qdisc_pkt_len(skb);
404 				qdisc_qstats_backlog_dec(sch, skb);
405 				--sch->q.qlen;
406 				rtnl_qdisc_drop(skb, sch);
407 			}
408 			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
409 			q->head = 0;
410 			q->tail = tail;
411 		}
412 
413 		q->tab_mask = mask;
414 		q->tab = ntab;
415 	} else
416 		sch_tree_lock(sch);
417 
418 	q->flags = ctl->flags;
419 	q->limit = ctl->limit;
420 
421 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
422 		      ctl->Plog, ctl->Scell_log,
423 		      nla_data(tb[TCA_CHOKE_STAB]),
424 		      max_P);
425 	red_set_vars(&q->vars);
426 
427 	if (q->head == q->tail)
428 		red_end_of_idle_period(&q->vars);
429 
430 	sch_tree_unlock(sch);
431 	choke_free(old);
432 	return 0;
433 }
434 
435 static int choke_init(struct Qdisc *sch, struct nlattr *opt,
436 		      struct netlink_ext_ack *extack)
437 {
438 	return choke_change(sch, opt, extack);
439 }
440 
441 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
442 {
443 	struct choke_sched_data *q = qdisc_priv(sch);
444 	struct nlattr *opts = NULL;
445 	struct tc_red_qopt opt = {
446 		.limit		= q->limit,
447 		.flags		= q->flags,
448 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
449 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
450 		.Wlog		= q->parms.Wlog,
451 		.Plog		= q->parms.Plog,
452 		.Scell_log	= q->parms.Scell_log,
453 	};
454 
455 	opts = nla_nest_start(skb, TCA_OPTIONS);
456 	if (opts == NULL)
457 		goto nla_put_failure;
458 
459 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
460 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
461 		goto nla_put_failure;
462 	return nla_nest_end(skb, opts);
463 
464 nla_put_failure:
465 	nla_nest_cancel(skb, opts);
466 	return -EMSGSIZE;
467 }
468 
469 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
470 {
471 	struct choke_sched_data *q = qdisc_priv(sch);
472 	struct tc_choke_xstats st = {
473 		.early	= q->stats.prob_drop + q->stats.forced_drop,
474 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
475 		.pdrop	= q->stats.pdrop,
476 		.other	= q->stats.other,
477 		.matched = q->stats.matched,
478 	};
479 
480 	return gnet_stats_copy_app(d, &st, sizeof(st));
481 }
482 
483 static void choke_destroy(struct Qdisc *sch)
484 {
485 	struct choke_sched_data *q = qdisc_priv(sch);
486 
487 	choke_free(q->tab);
488 }
489 
490 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
491 {
492 	struct choke_sched_data *q = qdisc_priv(sch);
493 
494 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
495 }
496 
497 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
498 	.id		=	"choke",
499 	.priv_size	=	sizeof(struct choke_sched_data),
500 
501 	.enqueue	=	choke_enqueue,
502 	.dequeue	=	choke_dequeue,
503 	.peek		=	choke_peek_head,
504 	.init		=	choke_init,
505 	.destroy	=	choke_destroy,
506 	.reset		=	choke_reset,
507 	.change		=	choke_change,
508 	.dump		=	choke_dump,
509 	.dump_stats	=	choke_dump_stats,
510 	.owner		=	THIS_MODULE,
511 };
512 
513 static int __init choke_module_init(void)
514 {
515 	return register_qdisc(&choke_qdisc_ops);
516 }
517 
518 static void __exit choke_module_exit(void)
519 {
520 	unregister_qdisc(&choke_qdisc_ops);
521 }
522 
523 module_init(choke_module_init)
524 module_exit(choke_module_exit)
525 
526 MODULE_LICENSE("GPL");
527