/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is an additional
   step during enqueue: if the average queue size is over the low threshold
   (qmin), a packet is chosen at random from the queue. If the new packet
   and the chosen packet belong to the same flow, both are dropped.
   Unlike RED, CHOKe is not really a "classful" qdisc, because it needs
   random access to the packets in its queue. It has a minimal class
   interface only to allow the built-in flow classifier to be overridden
   with filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth
   Allocation", IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */
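
/*
 * A condensed sketch of the enqueue decision implemented below (editorial
 * summary of choke_enqueue(), using names from struct red_parms/red_vars):
 *
 *	qavg = EWMA of the queue length (maintained by the shared RED code)
 *
 *	if (qavg <= qth_min)
 *		admit				(queue is small, no action)
 *	else {
 *		draw one queued packet at random
 *		if (it is from the same flow as the new packet)
 *			drop both		(the CHOKe step)
 *		else if (qavg > qth_max)
 *			ECN-mark or drop	(RED forced action)
 *		else
 *			ECN-mark or drop with RED's early-drop probability
 *	}
 *	finally, tail-drop if the queue is over its configured limit
 */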

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};
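
/*
 * Note on the packet table (summary of the helpers below): q->tab is a
 * power-of-two sized ring buffer, indexed modulo (tab_mask + 1).  Packets
 * are enqueued at q->tail and dequeued from q->head; choke_drop_by_idx()
 * removes a packet from the middle by leaving a NULL "hole", and the zap
 * helpers skip such holes when advancing head or rewinding tail, so
 * choke_len() counts occupied slots and holes alike.
 */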

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_drop(skb, sch);
	qdisc_tree_decrease_qlen(sch, 1);
	--sch->q.qlen;
}

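/*
 * Per-packet scratch state kept in skb->cb while a packet is queued here:
 * the class id assigned by an external tc filter (if one is attached) and
 * the lazily dissected flow keys used by choke_match_flow().  keys_valid
 * makes sure a given skb is dissected at most once.
 */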
struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct flow_keys	keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare the flows of two packets.
 *  Returns true only if source and destination addresses and ports match;
 *          false otherwise (including when the protocols differ).
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(struct flow_keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
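			/* fall through */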
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry a few
 *   times to find a random skb, then give up and return the head slot.
 * Returns NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (q->filter_list)
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;

	if (q->filter_list) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
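
	/*
	 * Sketch of the averaging done by the shared RED code (see
	 * include/net/red.h): qavg is an exponentially weighted moving
	 * average of the backlog,
	 *	avg <- (1 - W) * avg + W * qlen,  with W = 2^(-Wlog),
	 * kept internally scaled by 2^Wlog; this is also why choke_dump()
	 * reports qth_min/qth_max shifted right by Wlog.
	 */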
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			sch->qstats.overlimits++;
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				sch->qstats.overlimits++;
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		sch->qstats.backlog += qdisc_pkt_len(skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch);

congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		sch->qstats.drops++;
	kfree_skb(skb);
	return ret;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
	}

	return len;
}

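/*
 * ->reset(): restart the RED moving average and idle-period bookkeeping.
 * Note that the packet table itself is not touched here.
 */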
static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};


static void choke_free(void *addr)
{
	kvfree(addr);
}

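/*
 * ->change() handles both the initial configuration and later changes:
 * parse the RED-style netlink parameters, size the packet table to the
 * smallest power of two that can hold limit + 1 slots and, if the table
 * is being resized, repack the queued packets that still fit (dropping
 * the rest) before installing the new RED parameters under the qdisc
 * tree lock.
 */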
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;

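			/*
			 * Repack the surviving packets, in queue order, into
			 * the new table; any packet that no longer fits is
			 * dropped.
			 */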
			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				sch->qstats.backlog -= qdisc_pkt_len(skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

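/*
 * CHOKe has no real classes; the minimal class interface below exists
 * only so that tc filters can be attached (see the header comment), so
 * most of these operations are stubs.
 */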
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static void choke_put(struct Qdisc *q, unsigned long cl)
{
}

static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
				u32 classid)
{
	return 0;
}

static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	if (!arg->stop) {
		if (arg->fn(sch, 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops choke_class_ops = {
	.leaf		=	choke_leaf,
	.get		=	choke_get,
	.put		=	choke_put,
	.tcf_chain	=	choke_find_tcf,
	.bind_tcf	=	choke_bind,
	.unbind_tcf	=	choke_put,
	.dump		=	choke_dump_class,
	.walk		=	choke_walk,
};

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");