xref: /openbmc/linux/net/sched/sch_choke.c (revision f7777dcc)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/reciprocal_div.h>
18 #include <linux/vmalloc.h>
19 #include <net/pkt_sched.h>
20 #include <net/inet_ecn.h>
21 #include <net/red.h>
22 #include <net/flow_keys.h>
23 
24 /*
25    CHOKe stateless AQM for fair bandwidth allocation
26    =================================================
27 
28    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
29    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
30    maintains no flow state. The difference from RED is an additional step
31    during the enqueuing process. If average queue size is over the
32    low threshold (qmin), a packet is chosen at random from the queue.
33    If both the new and chosen packet are from the same flow, both
34    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
35    needs to access packets in queue randomly. It has a minimal class
36    interface to allow overriding the builtin flow classifier with
37    filters.
38 
39    Source:
40    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
41    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
42    IEEE INFOCOM, 2000.
43 
44    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
45    Characteristics", IEEE/ACM Transactions on Networking, 2004
46 
47  */
48 
49 /* Upper bound on size of sk_buff table (packets) */
50 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
51 
52 struct choke_sched_data {
53 /* Parameters */
54 	u32		 limit;
55 	unsigned char	 flags;
56 
57 	struct red_parms parms;
58 
59 /* Variables */
60 	struct red_vars  vars;
61 	struct tcf_proto *filter_list;
62 	struct {
63 		u32	prob_drop;	/* Early probability drops */
64 		u32	prob_mark;	/* Early probability marks */
65 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
66 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
67 		u32	pdrop;          /* Drops due to queue limits */
68 		u32	other;          /* Drops due to drop() calls */
69 		u32	matched;	/* Drops to flow match */
70 	} stats;
71 
72 	unsigned int	 head;
73 	unsigned int	 tail;
74 
75 	unsigned int	 tab_mask; /* size - 1 */
76 
77 	struct sk_buff **tab;
78 };
79 
80 /* deliver a random number between 0 and N - 1 */
81 static u32 random_N(unsigned int N)
82 {
83 	return reciprocal_divide(prandom_u32(), N);
84 }
85 
86 /* number of elements in queue including holes */
87 static unsigned int choke_len(const struct choke_sched_data *q)
88 {
89 	return (q->tail - q->head) & q->tab_mask;
90 }
91 
92 /* Is ECN parameter configured */
93 static int use_ecn(const struct choke_sched_data *q)
94 {
95 	return q->flags & TC_RED_ECN;
96 }
97 
98 /* Should packets over max just be dropped (versus marked) */
99 static int use_harddrop(const struct choke_sched_data *q)
100 {
101 	return q->flags & TC_RED_HARDDROP;
102 }
103 
104 /* Move head pointer forward to skip over holes */
105 static void choke_zap_head_holes(struct choke_sched_data *q)
106 {
107 	do {
108 		q->head = (q->head + 1) & q->tab_mask;
109 		if (q->head == q->tail)
110 			break;
111 	} while (q->tab[q->head] == NULL);
112 }
113 
114 /* Move tail pointer backwards to reuse holes */
115 static void choke_zap_tail_holes(struct choke_sched_data *q)
116 {
117 	do {
118 		q->tail = (q->tail - 1) & q->tab_mask;
119 		if (q->head == q->tail)
120 			break;
121 	} while (q->tab[q->tail] == NULL);
122 }
123 
124 /* Drop packet from queue array by creating a "hole" */
125 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
126 {
127 	struct choke_sched_data *q = qdisc_priv(sch);
128 	struct sk_buff *skb = q->tab[idx];
129 
130 	q->tab[idx] = NULL;
131 
132 	if (idx == q->head)
133 		choke_zap_head_holes(q);
134 	if (idx == q->tail)
135 		choke_zap_tail_holes(q);
136 
137 	sch->qstats.backlog -= qdisc_pkt_len(skb);
138 	qdisc_drop(skb, sch);
139 	qdisc_tree_decrease_qlen(sch, 1);
140 	--sch->q.qlen;
141 }
142 
143 struct choke_skb_cb {
144 	u16			classid;
145 	u8			keys_valid;
146 	struct flow_keys	keys;
147 };
148 
149 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
150 {
151 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
152 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
153 }
154 
155 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
156 {
157 	choke_skb_cb(skb)->classid = classid;
158 }
159 
160 static u16 choke_get_classid(const struct sk_buff *skb)
161 {
162 	return choke_skb_cb(skb)->classid;
163 }
164 
165 /*
166  * Compare flow of two packets
167  *  Returns true only if source and destination address and port match.
168  *          false for special cases
169  */
170 static bool choke_match_flow(struct sk_buff *skb1,
171 			     struct sk_buff *skb2)
172 {
173 	if (skb1->protocol != skb2->protocol)
174 		return false;
175 
176 	if (!choke_skb_cb(skb1)->keys_valid) {
177 		choke_skb_cb(skb1)->keys_valid = 1;
178 		skb_flow_dissect(skb1, &choke_skb_cb(skb1)->keys);
179 	}
180 
181 	if (!choke_skb_cb(skb2)->keys_valid) {
182 		choke_skb_cb(skb2)->keys_valid = 1;
183 		skb_flow_dissect(skb2, &choke_skb_cb(skb2)->keys);
184 	}
185 
186 	return !memcmp(&choke_skb_cb(skb1)->keys,
187 		       &choke_skb_cb(skb2)->keys,
188 		       sizeof(struct flow_keys));
189 }
190 
191 /*
192  * Classify flow using either:
193  *  1. pre-existing classification result in skb
194  *  2. fast internal classification
195  *  3. use TC filter based classification
196  */
197 static bool choke_classify(struct sk_buff *skb,
198 			   struct Qdisc *sch, int *qerr)
199 
200 {
201 	struct choke_sched_data *q = qdisc_priv(sch);
202 	struct tcf_result res;
203 	int result;
204 
205 	result = tc_classify(skb, q->filter_list, &res);
206 	if (result >= 0) {
207 #ifdef CONFIG_NET_CLS_ACT
208 		switch (result) {
209 		case TC_ACT_STOLEN:
210 		case TC_ACT_QUEUED:
211 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
212 		case TC_ACT_SHOT:
213 			return false;
214 		}
215 #endif
216 		choke_set_classid(skb, TC_H_MIN(res.classid));
217 		return true;
218 	}
219 
220 	return false;
221 }
222 
223 /*
224  * Select a packet at random from queue
225  * HACK: since queue can have holes from previous deletion; retry several
226  *   times to find a random skb but then just give up and return the head
227  * Will return NULL if queue is empty (q->head == q->tail)
228  */
229 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
230 					 unsigned int *pidx)
231 {
232 	struct sk_buff *skb;
233 	int retrys = 3;
234 
235 	do {
236 		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
237 		skb = q->tab[*pidx];
238 		if (skb)
239 			return skb;
240 	} while (--retrys > 0);
241 
242 	return q->tab[*pidx = q->head];
243 }
244 
245 /*
246  * Compare new packet with random packet in queue
247  * returns true if matched and sets *pidx
248  */
249 static bool choke_match_random(const struct choke_sched_data *q,
250 			       struct sk_buff *nskb,
251 			       unsigned int *pidx)
252 {
253 	struct sk_buff *oskb;
254 
255 	if (q->head == q->tail)
256 		return false;
257 
258 	oskb = choke_peek_random(q, pidx);
259 	if (q->filter_list)
260 		return choke_get_classid(nskb) == choke_get_classid(oskb);
261 
262 	return choke_match_flow(oskb, nskb);
263 }
264 
265 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
266 {
267 	struct choke_sched_data *q = qdisc_priv(sch);
268 	const struct red_parms *p = &q->parms;
269 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
270 
271 	if (q->filter_list) {
272 		/* If using external classifiers, get result and record it. */
273 		if (!choke_classify(skb, sch, &ret))
274 			goto other_drop;	/* Packet was eaten by filter */
275 	}
276 
277 	choke_skb_cb(skb)->keys_valid = 0;
278 	/* Compute average queue usage (see RED) */
279 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
280 	if (red_is_idling(&q->vars))
281 		red_end_of_idle_period(&q->vars);
282 
283 	/* Is queue small? */
284 	if (q->vars.qavg <= p->qth_min)
285 		q->vars.qcount = -1;
286 	else {
287 		unsigned int idx;
288 
289 		/* Draw a packet at random from queue and compare flow */
290 		if (choke_match_random(q, skb, &idx)) {
291 			q->stats.matched++;
292 			choke_drop_by_idx(sch, idx);
293 			goto congestion_drop;
294 		}
295 
296 		/* Queue is large, always mark/drop */
297 		if (q->vars.qavg > p->qth_max) {
298 			q->vars.qcount = -1;
299 
300 			sch->qstats.overlimits++;
301 			if (use_harddrop(q) || !use_ecn(q) ||
302 			    !INET_ECN_set_ce(skb)) {
303 				q->stats.forced_drop++;
304 				goto congestion_drop;
305 			}
306 
307 			q->stats.forced_mark++;
308 		} else if (++q->vars.qcount) {
309 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
310 				q->vars.qcount = 0;
311 				q->vars.qR = red_random(p);
312 
313 				sch->qstats.overlimits++;
314 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
315 					q->stats.prob_drop++;
316 					goto congestion_drop;
317 				}
318 
319 				q->stats.prob_mark++;
320 			}
321 		} else
322 			q->vars.qR = red_random(p);
323 	}
324 
325 	/* Admit new packet */
326 	if (sch->q.qlen < q->limit) {
327 		q->tab[q->tail] = skb;
328 		q->tail = (q->tail + 1) & q->tab_mask;
329 		++sch->q.qlen;
330 		sch->qstats.backlog += qdisc_pkt_len(skb);
331 		return NET_XMIT_SUCCESS;
332 	}
333 
334 	q->stats.pdrop++;
335 	return qdisc_drop(skb, sch);
336 
337 congestion_drop:
338 	qdisc_drop(skb, sch);
339 	return NET_XMIT_CN;
340 
341 other_drop:
342 	if (ret & __NET_XMIT_BYPASS)
343 		sch->qstats.drops++;
344 	kfree_skb(skb);
345 	return ret;
346 }
347 
348 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
349 {
350 	struct choke_sched_data *q = qdisc_priv(sch);
351 	struct sk_buff *skb;
352 
353 	if (q->head == q->tail) {
354 		if (!red_is_idling(&q->vars))
355 			red_start_of_idle_period(&q->vars);
356 		return NULL;
357 	}
358 
359 	skb = q->tab[q->head];
360 	q->tab[q->head] = NULL;
361 	choke_zap_head_holes(q);
362 	--sch->q.qlen;
363 	sch->qstats.backlog -= qdisc_pkt_len(skb);
364 	qdisc_bstats_update(sch, skb);
365 
366 	return skb;
367 }
368 
369 static unsigned int choke_drop(struct Qdisc *sch)
370 {
371 	struct choke_sched_data *q = qdisc_priv(sch);
372 	unsigned int len;
373 
374 	len = qdisc_queue_drop(sch);
375 	if (len > 0)
376 		q->stats.other++;
377 	else {
378 		if (!red_is_idling(&q->vars))
379 			red_start_of_idle_period(&q->vars);
380 	}
381 
382 	return len;
383 }
384 
385 static void choke_reset(struct Qdisc *sch)
386 {
387 	struct choke_sched_data *q = qdisc_priv(sch);
388 
389 	red_restart(&q->vars);
390 }
391 
392 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
393 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
394 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
395 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
396 };
397 
398 
399 static void choke_free(void *addr)
400 {
401 	if (addr) {
402 		if (is_vmalloc_addr(addr))
403 			vfree(addr);
404 		else
405 			kfree(addr);
406 	}
407 }
408 
409 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
410 {
411 	struct choke_sched_data *q = qdisc_priv(sch);
412 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
413 	const struct tc_red_qopt *ctl;
414 	int err;
415 	struct sk_buff **old = NULL;
416 	unsigned int mask;
417 	u32 max_P;
418 
419 	if (opt == NULL)
420 		return -EINVAL;
421 
422 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
423 	if (err < 0)
424 		return err;
425 
426 	if (tb[TCA_CHOKE_PARMS] == NULL ||
427 	    tb[TCA_CHOKE_STAB] == NULL)
428 		return -EINVAL;
429 
430 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
431 
432 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
433 
434 	if (ctl->limit > CHOKE_MAX_QUEUE)
435 		return -EINVAL;
436 
437 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
438 	if (mask != q->tab_mask) {
439 		struct sk_buff **ntab;
440 
441 		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
442 			       GFP_KERNEL | __GFP_NOWARN);
443 		if (!ntab)
444 			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
445 		if (!ntab)
446 			return -ENOMEM;
447 
448 		sch_tree_lock(sch);
449 		old = q->tab;
450 		if (old) {
451 			unsigned int oqlen = sch->q.qlen, tail = 0;
452 
453 			while (q->head != q->tail) {
454 				struct sk_buff *skb = q->tab[q->head];
455 
456 				q->head = (q->head + 1) & q->tab_mask;
457 				if (!skb)
458 					continue;
459 				if (tail < mask) {
460 					ntab[tail++] = skb;
461 					continue;
462 				}
463 				sch->qstats.backlog -= qdisc_pkt_len(skb);
464 				--sch->q.qlen;
465 				qdisc_drop(skb, sch);
466 			}
467 			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
468 			q->head = 0;
469 			q->tail = tail;
470 		}
471 
472 		q->tab_mask = mask;
473 		q->tab = ntab;
474 	} else
475 		sch_tree_lock(sch);
476 
477 	q->flags = ctl->flags;
478 	q->limit = ctl->limit;
479 
480 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
481 		      ctl->Plog, ctl->Scell_log,
482 		      nla_data(tb[TCA_CHOKE_STAB]),
483 		      max_P);
484 	red_set_vars(&q->vars);
485 
486 	if (q->head == q->tail)
487 		red_end_of_idle_period(&q->vars);
488 
489 	sch_tree_unlock(sch);
490 	choke_free(old);
491 	return 0;
492 }
493 
494 static int choke_init(struct Qdisc *sch, struct nlattr *opt)
495 {
496 	return choke_change(sch, opt);
497 }
498 
499 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
500 {
501 	struct choke_sched_data *q = qdisc_priv(sch);
502 	struct nlattr *opts = NULL;
503 	struct tc_red_qopt opt = {
504 		.limit		= q->limit,
505 		.flags		= q->flags,
506 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
507 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
508 		.Wlog		= q->parms.Wlog,
509 		.Plog		= q->parms.Plog,
510 		.Scell_log	= q->parms.Scell_log,
511 	};
512 
513 	opts = nla_nest_start(skb, TCA_OPTIONS);
514 	if (opts == NULL)
515 		goto nla_put_failure;
516 
517 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
518 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
519 		goto nla_put_failure;
520 	return nla_nest_end(skb, opts);
521 
522 nla_put_failure:
523 	nla_nest_cancel(skb, opts);
524 	return -EMSGSIZE;
525 }
526 
527 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
528 {
529 	struct choke_sched_data *q = qdisc_priv(sch);
530 	struct tc_choke_xstats st = {
531 		.early	= q->stats.prob_drop + q->stats.forced_drop,
532 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
533 		.pdrop	= q->stats.pdrop,
534 		.other	= q->stats.other,
535 		.matched = q->stats.matched,
536 	};
537 
538 	return gnet_stats_copy_app(d, &st, sizeof(st));
539 }
540 
541 static void choke_destroy(struct Qdisc *sch)
542 {
543 	struct choke_sched_data *q = qdisc_priv(sch);
544 
545 	tcf_destroy_chain(&q->filter_list);
546 	choke_free(q->tab);
547 }
548 
549 static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
550 {
551 	return NULL;
552 }
553 
554 static unsigned long choke_get(struct Qdisc *sch, u32 classid)
555 {
556 	return 0;
557 }
558 
559 static void choke_put(struct Qdisc *q, unsigned long cl)
560 {
561 }
562 
563 static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
564 				u32 classid)
565 {
566 	return 0;
567 }
568 
569 static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
570 {
571 	struct choke_sched_data *q = qdisc_priv(sch);
572 
573 	if (cl)
574 		return NULL;
575 	return &q->filter_list;
576 }
577 
578 static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
579 			  struct sk_buff *skb, struct tcmsg *tcm)
580 {
581 	tcm->tcm_handle |= TC_H_MIN(cl);
582 	return 0;
583 }
584 
585 static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
586 {
587 	if (!arg->stop) {
588 		if (arg->fn(sch, 1, arg) < 0) {
589 			arg->stop = 1;
590 			return;
591 		}
592 		arg->count++;
593 	}
594 }
595 
596 static const struct Qdisc_class_ops choke_class_ops = {
597 	.leaf		=	choke_leaf,
598 	.get		=	choke_get,
599 	.put		=	choke_put,
600 	.tcf_chain	=	choke_find_tcf,
601 	.bind_tcf	=	choke_bind,
602 	.unbind_tcf	=	choke_put,
603 	.dump		=	choke_dump_class,
604 	.walk		=	choke_walk,
605 };
606 
607 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
608 {
609 	struct choke_sched_data *q = qdisc_priv(sch);
610 
611 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
612 }
613 
614 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
615 	.id		=	"choke",
616 	.priv_size	=	sizeof(struct choke_sched_data),
617 
618 	.enqueue	=	choke_enqueue,
619 	.dequeue	=	choke_dequeue,
620 	.peek		=	choke_peek_head,
621 	.drop		=	choke_drop,
622 	.init		=	choke_init,
623 	.destroy	=	choke_destroy,
624 	.reset		=	choke_reset,
625 	.change		=	choke_change,
626 	.dump		=	choke_dump,
627 	.dump_stats	=	choke_dump_stats,
628 	.owner		=	THIS_MODULE,
629 };
630 
631 static int __init choke_module_init(void)
632 {
633 	return register_qdisc(&choke_qdisc_ops);
634 }
635 
636 static void __exit choke_module_exit(void)
637 {
638 	unregister_qdisc(&choke_qdisc_ops);
639 }
640 
641 module_init(choke_module_init)
642 module_exit(choke_module_exit)
643 
644 MODULE_LICENSE("GPL");
645