xref: /openbmc/linux/net/sched/sch_choke.c (revision 92b19ff5)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/vmalloc.h>
18 #include <net/pkt_sched.h>
19 #include <net/inet_ecn.h>
20 #include <net/red.h>
21 #include <net/flow_dissector.h>
22 
23 /*
24    CHOKe stateless AQM for fair bandwidth allocation
25    =================================================
26 
27    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
28    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
29    maintains no flow state. The difference from RED is an additional step
30    during the enqueuing process. If average queue size is over the
31    low threshold (qmin), a packet is chosen at random from the queue.
32    If both the new and chosen packet are from the same flow, both
33    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
34    needs to access packets in queue randomly. It has a minimal class
35    interface to allow overriding the builtin flow classifier with
36    filters.
37 
38    Source:
39    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
40    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
41    IEEE INFOCOM, 2000.
42 
43    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
44    Characteristics", IEEE/ACM Transactions on Networking, 2004
45 
46  */
47 
48 /* Upper bound on size of sk_buff table (packets) */
49 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
50 
51 struct choke_sched_data {
52 /* Parameters */
53 	u32		 limit;
54 	unsigned char	 flags;
55 
56 	struct red_parms parms;
57 
58 /* Variables */
59 	struct red_vars  vars;
60 	struct tcf_proto __rcu *filter_list;
61 	struct {
62 		u32	prob_drop;	/* Early probability drops */
63 		u32	prob_mark;	/* Early probability marks */
64 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
65 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
66 		u32	pdrop;          /* Drops due to queue limits */
67 		u32	other;          /* Drops due to drop() calls */
68 		u32	matched;	/* Drops to flow match */
69 	} stats;
70 
71 	unsigned int	 head;
72 	unsigned int	 tail;
73 
74 	unsigned int	 tab_mask; /* size - 1 */
75 
76 	struct sk_buff **tab;
77 };
78 
79 /* number of elements in queue including holes */
80 static unsigned int choke_len(const struct choke_sched_data *q)
81 {
82 	return (q->tail - q->head) & q->tab_mask;
83 }
84 
85 /* Is ECN parameter configured */
86 static int use_ecn(const struct choke_sched_data *q)
87 {
88 	return q->flags & TC_RED_ECN;
89 }
90 
91 /* Should packets over max just be dropped (versus marked) */
92 static int use_harddrop(const struct choke_sched_data *q)
93 {
94 	return q->flags & TC_RED_HARDDROP;
95 }
96 
97 /* Move head pointer forward to skip over holes */
98 static void choke_zap_head_holes(struct choke_sched_data *q)
99 {
100 	do {
101 		q->head = (q->head + 1) & q->tab_mask;
102 		if (q->head == q->tail)
103 			break;
104 	} while (q->tab[q->head] == NULL);
105 }
106 
107 /* Move tail pointer backwards to reuse holes */
108 static void choke_zap_tail_holes(struct choke_sched_data *q)
109 {
110 	do {
111 		q->tail = (q->tail - 1) & q->tab_mask;
112 		if (q->head == q->tail)
113 			break;
114 	} while (q->tab[q->tail] == NULL);
115 }
116 
117 /* Drop packet from queue array by creating a "hole" */
118 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
119 {
120 	struct choke_sched_data *q = qdisc_priv(sch);
121 	struct sk_buff *skb = q->tab[idx];
122 
123 	q->tab[idx] = NULL;
124 
125 	if (idx == q->head)
126 		choke_zap_head_holes(q);
127 	if (idx == q->tail)
128 		choke_zap_tail_holes(q);
129 
130 	qdisc_qstats_backlog_dec(sch, skb);
131 	qdisc_drop(skb, sch);
132 	qdisc_tree_decrease_qlen(sch, 1);
133 	--sch->q.qlen;
134 }
135 
136 struct choke_skb_cb {
137 	u16			classid;
138 	u8			keys_valid;
139 	struct			flow_keys_digest keys;
140 };
141 
142 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
143 {
144 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
145 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
146 }
147 
148 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
149 {
150 	choke_skb_cb(skb)->classid = classid;
151 }
152 
153 static u16 choke_get_classid(const struct sk_buff *skb)
154 {
155 	return choke_skb_cb(skb)->classid;
156 }
157 
158 /*
159  * Compare flow of two packets
160  *  Returns true only if source and destination address and port match.
161  *          false for special cases
162  */
163 static bool choke_match_flow(struct sk_buff *skb1,
164 			     struct sk_buff *skb2)
165 {
166 	struct flow_keys temp;
167 
168 	if (skb1->protocol != skb2->protocol)
169 		return false;
170 
171 	if (!choke_skb_cb(skb1)->keys_valid) {
172 		choke_skb_cb(skb1)->keys_valid = 1;
173 		skb_flow_dissect_flow_keys(skb1, &temp);
174 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
175 	}
176 
177 	if (!choke_skb_cb(skb2)->keys_valid) {
178 		choke_skb_cb(skb2)->keys_valid = 1;
179 		skb_flow_dissect_flow_keys(skb2, &temp);
180 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
181 	}
182 
183 	return !memcmp(&choke_skb_cb(skb1)->keys,
184 		       &choke_skb_cb(skb2)->keys,
185 		       sizeof(choke_skb_cb(skb1)->keys));
186 }
187 
188 /*
189  * Classify flow using either:
190  *  1. pre-existing classification result in skb
191  *  2. fast internal classification
192  *  3. use TC filter based classification
193  */
194 static bool choke_classify(struct sk_buff *skb,
195 			   struct Qdisc *sch, int *qerr)
196 
197 {
198 	struct choke_sched_data *q = qdisc_priv(sch);
199 	struct tcf_result res;
200 	struct tcf_proto *fl;
201 	int result;
202 
203 	fl = rcu_dereference_bh(q->filter_list);
204 	result = tc_classify(skb, fl, &res);
205 	if (result >= 0) {
206 #ifdef CONFIG_NET_CLS_ACT
207 		switch (result) {
208 		case TC_ACT_STOLEN:
209 		case TC_ACT_QUEUED:
210 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
211 		case TC_ACT_SHOT:
212 			return false;
213 		}
214 #endif
215 		choke_set_classid(skb, TC_H_MIN(res.classid));
216 		return true;
217 	}
218 
219 	return false;
220 }
221 
222 /*
223  * Select a packet at random from queue
224  * HACK: since queue can have holes from previous deletion; retry several
225  *   times to find a random skb but then just give up and return the head
226  * Will return NULL if queue is empty (q->head == q->tail)
227  */
228 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
229 					 unsigned int *pidx)
230 {
231 	struct sk_buff *skb;
232 	int retrys = 3;
233 
234 	do {
235 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
236 		skb = q->tab[*pidx];
237 		if (skb)
238 			return skb;
239 	} while (--retrys > 0);
240 
241 	return q->tab[*pidx = q->head];
242 }
243 
244 /*
245  * Compare new packet with random packet in queue
246  * returns true if matched and sets *pidx
247  */
248 static bool choke_match_random(const struct choke_sched_data *q,
249 			       struct sk_buff *nskb,
250 			       unsigned int *pidx)
251 {
252 	struct sk_buff *oskb;
253 
254 	if (q->head == q->tail)
255 		return false;
256 
257 	oskb = choke_peek_random(q, pidx);
258 	if (rcu_access_pointer(q->filter_list))
259 		return choke_get_classid(nskb) == choke_get_classid(oskb);
260 
261 	return choke_match_flow(oskb, nskb);
262 }
263 
264 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
265 {
266 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
267 	struct choke_sched_data *q = qdisc_priv(sch);
268 	const struct red_parms *p = &q->parms;
269 
270 	if (rcu_access_pointer(q->filter_list)) {
271 		/* If using external classifiers, get result and record it. */
272 		if (!choke_classify(skb, sch, &ret))
273 			goto other_drop;	/* Packet was eaten by filter */
274 	}
275 
276 	choke_skb_cb(skb)->keys_valid = 0;
277 	/* Compute average queue usage (see RED) */
278 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
279 	if (red_is_idling(&q->vars))
280 		red_end_of_idle_period(&q->vars);
281 
282 	/* Is queue small? */
283 	if (q->vars.qavg <= p->qth_min)
284 		q->vars.qcount = -1;
285 	else {
286 		unsigned int idx;
287 
288 		/* Draw a packet at random from queue and compare flow */
289 		if (choke_match_random(q, skb, &idx)) {
290 			q->stats.matched++;
291 			choke_drop_by_idx(sch, idx);
292 			goto congestion_drop;
293 		}
294 
295 		/* Queue is large, always mark/drop */
296 		if (q->vars.qavg > p->qth_max) {
297 			q->vars.qcount = -1;
298 
299 			qdisc_qstats_overlimit(sch);
300 			if (use_harddrop(q) || !use_ecn(q) ||
301 			    !INET_ECN_set_ce(skb)) {
302 				q->stats.forced_drop++;
303 				goto congestion_drop;
304 			}
305 
306 			q->stats.forced_mark++;
307 		} else if (++q->vars.qcount) {
308 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
309 				q->vars.qcount = 0;
310 				q->vars.qR = red_random(p);
311 
312 				qdisc_qstats_overlimit(sch);
313 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
314 					q->stats.prob_drop++;
315 					goto congestion_drop;
316 				}
317 
318 				q->stats.prob_mark++;
319 			}
320 		} else
321 			q->vars.qR = red_random(p);
322 	}
323 
324 	/* Admit new packet */
325 	if (sch->q.qlen < q->limit) {
326 		q->tab[q->tail] = skb;
327 		q->tail = (q->tail + 1) & q->tab_mask;
328 		++sch->q.qlen;
329 		qdisc_qstats_backlog_inc(sch, skb);
330 		return NET_XMIT_SUCCESS;
331 	}
332 
333 	q->stats.pdrop++;
334 	return qdisc_drop(skb, sch);
335 
336 congestion_drop:
337 	qdisc_drop(skb, sch);
338 	return NET_XMIT_CN;
339 
340 other_drop:
341 	if (ret & __NET_XMIT_BYPASS)
342 		qdisc_qstats_drop(sch);
343 	kfree_skb(skb);
344 	return ret;
345 }
346 
347 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
348 {
349 	struct choke_sched_data *q = qdisc_priv(sch);
350 	struct sk_buff *skb;
351 
352 	if (q->head == q->tail) {
353 		if (!red_is_idling(&q->vars))
354 			red_start_of_idle_period(&q->vars);
355 		return NULL;
356 	}
357 
358 	skb = q->tab[q->head];
359 	q->tab[q->head] = NULL;
360 	choke_zap_head_holes(q);
361 	--sch->q.qlen;
362 	qdisc_qstats_backlog_dec(sch, skb);
363 	qdisc_bstats_update(sch, skb);
364 
365 	return skb;
366 }
367 
368 static unsigned int choke_drop(struct Qdisc *sch)
369 {
370 	struct choke_sched_data *q = qdisc_priv(sch);
371 	unsigned int len;
372 
373 	len = qdisc_queue_drop(sch);
374 	if (len > 0)
375 		q->stats.other++;
376 	else {
377 		if (!red_is_idling(&q->vars))
378 			red_start_of_idle_period(&q->vars);
379 	}
380 
381 	return len;
382 }
383 
384 static void choke_reset(struct Qdisc *sch)
385 {
386 	struct choke_sched_data *q = qdisc_priv(sch);
387 
388 	red_restart(&q->vars);
389 }
390 
391 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
392 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
393 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
394 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
395 };
396 
397 
398 static void choke_free(void *addr)
399 {
400 	kvfree(addr);
401 }
402 
403 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
404 {
405 	struct choke_sched_data *q = qdisc_priv(sch);
406 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
407 	const struct tc_red_qopt *ctl;
408 	int err;
409 	struct sk_buff **old = NULL;
410 	unsigned int mask;
411 	u32 max_P;
412 
413 	if (opt == NULL)
414 		return -EINVAL;
415 
416 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
417 	if (err < 0)
418 		return err;
419 
420 	if (tb[TCA_CHOKE_PARMS] == NULL ||
421 	    tb[TCA_CHOKE_STAB] == NULL)
422 		return -EINVAL;
423 
424 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
425 
426 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
427 
428 	if (ctl->limit > CHOKE_MAX_QUEUE)
429 		return -EINVAL;
430 
431 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
432 	if (mask != q->tab_mask) {
433 		struct sk_buff **ntab;
434 
435 		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
436 			       GFP_KERNEL | __GFP_NOWARN);
437 		if (!ntab)
438 			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
439 		if (!ntab)
440 			return -ENOMEM;
441 
442 		sch_tree_lock(sch);
443 		old = q->tab;
444 		if (old) {
445 			unsigned int oqlen = sch->q.qlen, tail = 0;
446 
447 			while (q->head != q->tail) {
448 				struct sk_buff *skb = q->tab[q->head];
449 
450 				q->head = (q->head + 1) & q->tab_mask;
451 				if (!skb)
452 					continue;
453 				if (tail < mask) {
454 					ntab[tail++] = skb;
455 					continue;
456 				}
457 				qdisc_qstats_backlog_dec(sch, skb);
458 				--sch->q.qlen;
459 				qdisc_drop(skb, sch);
460 			}
461 			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
462 			q->head = 0;
463 			q->tail = tail;
464 		}
465 
466 		q->tab_mask = mask;
467 		q->tab = ntab;
468 	} else
469 		sch_tree_lock(sch);
470 
471 	q->flags = ctl->flags;
472 	q->limit = ctl->limit;
473 
474 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
475 		      ctl->Plog, ctl->Scell_log,
476 		      nla_data(tb[TCA_CHOKE_STAB]),
477 		      max_P);
478 	red_set_vars(&q->vars);
479 
480 	if (q->head == q->tail)
481 		red_end_of_idle_period(&q->vars);
482 
483 	sch_tree_unlock(sch);
484 	choke_free(old);
485 	return 0;
486 }
487 
488 static int choke_init(struct Qdisc *sch, struct nlattr *opt)
489 {
490 	return choke_change(sch, opt);
491 }
492 
493 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
494 {
495 	struct choke_sched_data *q = qdisc_priv(sch);
496 	struct nlattr *opts = NULL;
497 	struct tc_red_qopt opt = {
498 		.limit		= q->limit,
499 		.flags		= q->flags,
500 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
501 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
502 		.Wlog		= q->parms.Wlog,
503 		.Plog		= q->parms.Plog,
504 		.Scell_log	= q->parms.Scell_log,
505 	};
506 
507 	opts = nla_nest_start(skb, TCA_OPTIONS);
508 	if (opts == NULL)
509 		goto nla_put_failure;
510 
511 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
512 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
513 		goto nla_put_failure;
514 	return nla_nest_end(skb, opts);
515 
516 nla_put_failure:
517 	nla_nest_cancel(skb, opts);
518 	return -EMSGSIZE;
519 }
520 
521 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
522 {
523 	struct choke_sched_data *q = qdisc_priv(sch);
524 	struct tc_choke_xstats st = {
525 		.early	= q->stats.prob_drop + q->stats.forced_drop,
526 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
527 		.pdrop	= q->stats.pdrop,
528 		.other	= q->stats.other,
529 		.matched = q->stats.matched,
530 	};
531 
532 	return gnet_stats_copy_app(d, &st, sizeof(st));
533 }
534 
535 static void choke_destroy(struct Qdisc *sch)
536 {
537 	struct choke_sched_data *q = qdisc_priv(sch);
538 
539 	tcf_destroy_chain(&q->filter_list);
540 	choke_free(q->tab);
541 }
542 
543 static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
544 {
545 	return NULL;
546 }
547 
548 static unsigned long choke_get(struct Qdisc *sch, u32 classid)
549 {
550 	return 0;
551 }
552 
553 static void choke_put(struct Qdisc *q, unsigned long cl)
554 {
555 }
556 
557 static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
558 				u32 classid)
559 {
560 	return 0;
561 }
562 
563 static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
564 					       unsigned long cl)
565 {
566 	struct choke_sched_data *q = qdisc_priv(sch);
567 
568 	if (cl)
569 		return NULL;
570 	return &q->filter_list;
571 }
572 
573 static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
574 			  struct sk_buff *skb, struct tcmsg *tcm)
575 {
576 	tcm->tcm_handle |= TC_H_MIN(cl);
577 	return 0;
578 }
579 
580 static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
581 {
582 	if (!arg->stop) {
583 		if (arg->fn(sch, 1, arg) < 0) {
584 			arg->stop = 1;
585 			return;
586 		}
587 		arg->count++;
588 	}
589 }
590 
591 static const struct Qdisc_class_ops choke_class_ops = {
592 	.leaf		=	choke_leaf,
593 	.get		=	choke_get,
594 	.put		=	choke_put,
595 	.tcf_chain	=	choke_find_tcf,
596 	.bind_tcf	=	choke_bind,
597 	.unbind_tcf	=	choke_put,
598 	.dump		=	choke_dump_class,
599 	.walk		=	choke_walk,
600 };
601 
602 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
603 {
604 	struct choke_sched_data *q = qdisc_priv(sch);
605 
606 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
607 }
608 
609 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
610 	.id		=	"choke",
611 	.priv_size	=	sizeof(struct choke_sched_data),
612 
613 	.enqueue	=	choke_enqueue,
614 	.dequeue	=	choke_dequeue,
615 	.peek		=	choke_peek_head,
616 	.drop		=	choke_drop,
617 	.init		=	choke_init,
618 	.destroy	=	choke_destroy,
619 	.reset		=	choke_reset,
620 	.change		=	choke_change,
621 	.dump		=	choke_dump,
622 	.dump_stats	=	choke_dump_stats,
623 	.owner		=	THIS_MODULE,
624 };
625 
626 static int __init choke_module_init(void)
627 {
628 	return register_qdisc(&choke_qdisc_ops);
629 }
630 
631 static void __exit choke_module_exit(void)
632 {
633 	unregister_qdisc(&choke_qdisc_ops);
634 }
635 
636 module_init(choke_module_init)
637 module_exit(choke_module_exit)
638 
639 MODULE_LICENSE("GPL");
640