xref: /openbmc/linux/net/sched/sch_choke.c (revision 93d90ad7)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/vmalloc.h>
18 #include <net/pkt_sched.h>
19 #include <net/inet_ecn.h>
20 #include <net/red.h>
21 #include <net/flow_keys.h>
22 
23 /*
24    CHOKe stateless AQM for fair bandwidth allocation
25    =================================================
26 
27    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
28    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
29    maintains no flow state. The difference from RED is an additional step
30    during the enqueuing process. If average queue size is over the
31    low threshold (qmin), a packet is chosen at random from the queue.
32    If both the new and chosen packet are from the same flow, both
33    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
34    needs to access packets in queue randomly. It has a minimal class
35    interface to allow overriding the builtin flow classifier with
36    filters.
37 
38    Source:
39    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
40    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
41    IEEE INFOCOM, 2000.
42 
43    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
44    Characteristics", IEEE/ACM Transactions on Networking, 2004
45 
46  */
47 
/*
 * Largest queue limit (in packets) accepted from user space; this also
 * bounds the size of the sk_buff pointer table.
 */
#define CHOKE_MAX_QUEUE	((128 * 1024) - 1)
50 
51 struct choke_sched_data {
52 /* Parameters */
53 	u32		 limit;
54 	unsigned char	 flags;
55 
56 	struct red_parms parms;
57 
58 /* Variables */
59 	struct red_vars  vars;
60 	struct tcf_proto __rcu *filter_list;
61 	struct {
62 		u32	prob_drop;	/* Early probability drops */
63 		u32	prob_mark;	/* Early probability marks */
64 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
65 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
66 		u32	pdrop;          /* Drops due to queue limits */
67 		u32	other;          /* Drops due to drop() calls */
68 		u32	matched;	/* Drops to flow match */
69 	} stats;
70 
71 	unsigned int	 head;
72 	unsigned int	 tail;
73 
74 	unsigned int	 tab_mask; /* size - 1 */
75 
76 	struct sk_buff **tab;
77 };
78 
79 /* number of elements in queue including holes */
80 static unsigned int choke_len(const struct choke_sched_data *q)
81 {
82 	return (q->tail - q->head) & q->tab_mask;
83 }
84 
85 /* Is ECN parameter configured */
86 static int use_ecn(const struct choke_sched_data *q)
87 {
88 	return q->flags & TC_RED_ECN;
89 }
90 
91 /* Should packets over max just be dropped (versus marked) */
92 static int use_harddrop(const struct choke_sched_data *q)
93 {
94 	return q->flags & TC_RED_HARDDROP;
95 }
96 
97 /* Move head pointer forward to skip over holes */
98 static void choke_zap_head_holes(struct choke_sched_data *q)
99 {
100 	do {
101 		q->head = (q->head + 1) & q->tab_mask;
102 		if (q->head == q->tail)
103 			break;
104 	} while (q->tab[q->head] == NULL);
105 }
106 
107 /* Move tail pointer backwards to reuse holes */
108 static void choke_zap_tail_holes(struct choke_sched_data *q)
109 {
110 	do {
111 		q->tail = (q->tail - 1) & q->tab_mask;
112 		if (q->head == q->tail)
113 			break;
114 	} while (q->tab[q->tail] == NULL);
115 }
116 
117 /* Drop packet from queue array by creating a "hole" */
118 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
119 {
120 	struct choke_sched_data *q = qdisc_priv(sch);
121 	struct sk_buff *skb = q->tab[idx];
122 
123 	q->tab[idx] = NULL;
124 
125 	if (idx == q->head)
126 		choke_zap_head_holes(q);
127 	if (idx == q->tail)
128 		choke_zap_tail_holes(q);
129 
130 	qdisc_qstats_backlog_dec(sch, skb);
131 	qdisc_drop(skb, sch);
132 	qdisc_tree_decrease_qlen(sch, 1);
133 	--sch->q.qlen;
134 }
135 
136 /* private part of skb->cb[] that a qdisc is allowed to use
137  * is limited to QDISC_CB_PRIV_LEN bytes.
138  * As a flow key might be too large, we store a part of it only.
139  */
140 #define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)
141 
142 struct choke_skb_cb {
143 	u16			classid;
144 	u8			keys_valid;
145 	u8			keys[QDISC_CB_PRIV_LEN - 3];
146 };
147 
148 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
149 {
150 	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
151 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
152 }
153 
154 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
155 {
156 	choke_skb_cb(skb)->classid = classid;
157 }
158 
159 static u16 choke_get_classid(const struct sk_buff *skb)
160 {
161 	return choke_skb_cb(skb)->classid;
162 }
163 
164 /*
165  * Compare flow of two packets
166  *  Returns true only if source and destination address and port match.
167  *          false for special cases
168  */
169 static bool choke_match_flow(struct sk_buff *skb1,
170 			     struct sk_buff *skb2)
171 {
172 	struct flow_keys temp;
173 
174 	if (skb1->protocol != skb2->protocol)
175 		return false;
176 
177 	if (!choke_skb_cb(skb1)->keys_valid) {
178 		choke_skb_cb(skb1)->keys_valid = 1;
179 		skb_flow_dissect(skb1, &temp);
180 		memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
181 	}
182 
183 	if (!choke_skb_cb(skb2)->keys_valid) {
184 		choke_skb_cb(skb2)->keys_valid = 1;
185 		skb_flow_dissect(skb2, &temp);
186 		memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
187 	}
188 
189 	return !memcmp(&choke_skb_cb(skb1)->keys,
190 		       &choke_skb_cb(skb2)->keys,
191 		       CHOKE_K_LEN);
192 }
193 
194 /*
195  * Classify flow using either:
196  *  1. pre-existing classification result in skb
197  *  2. fast internal classification
198  *  3. use TC filter based classification
199  */
200 static bool choke_classify(struct sk_buff *skb,
201 			   struct Qdisc *sch, int *qerr)
202 
203 {
204 	struct choke_sched_data *q = qdisc_priv(sch);
205 	struct tcf_result res;
206 	struct tcf_proto *fl;
207 	int result;
208 
209 	fl = rcu_dereference_bh(q->filter_list);
210 	result = tc_classify(skb, fl, &res);
211 	if (result >= 0) {
212 #ifdef CONFIG_NET_CLS_ACT
213 		switch (result) {
214 		case TC_ACT_STOLEN:
215 		case TC_ACT_QUEUED:
216 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
217 		case TC_ACT_SHOT:
218 			return false;
219 		}
220 #endif
221 		choke_set_classid(skb, TC_H_MIN(res.classid));
222 		return true;
223 	}
224 
225 	return false;
226 }
227 
228 /*
229  * Select a packet at random from queue
230  * HACK: since queue can have holes from previous deletion; retry several
231  *   times to find a random skb but then just give up and return the head
232  * Will return NULL if queue is empty (q->head == q->tail)
233  */
234 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
235 					 unsigned int *pidx)
236 {
237 	struct sk_buff *skb;
238 	int retrys = 3;
239 
240 	do {
241 		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
242 		skb = q->tab[*pidx];
243 		if (skb)
244 			return skb;
245 	} while (--retrys > 0);
246 
247 	return q->tab[*pidx = q->head];
248 }
249 
250 /*
251  * Compare new packet with random packet in queue
252  * returns true if matched and sets *pidx
253  */
254 static bool choke_match_random(const struct choke_sched_data *q,
255 			       struct sk_buff *nskb,
256 			       unsigned int *pidx)
257 {
258 	struct sk_buff *oskb;
259 
260 	if (q->head == q->tail)
261 		return false;
262 
263 	oskb = choke_peek_random(q, pidx);
264 	if (rcu_access_pointer(q->filter_list))
265 		return choke_get_classid(nskb) == choke_get_classid(oskb);
266 
267 	return choke_match_flow(oskb, nskb);
268 }
269 
270 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
271 {
272 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
273 	struct choke_sched_data *q = qdisc_priv(sch);
274 	const struct red_parms *p = &q->parms;
275 
276 	if (rcu_access_pointer(q->filter_list)) {
277 		/* If using external classifiers, get result and record it. */
278 		if (!choke_classify(skb, sch, &ret))
279 			goto other_drop;	/* Packet was eaten by filter */
280 	}
281 
282 	choke_skb_cb(skb)->keys_valid = 0;
283 	/* Compute average queue usage (see RED) */
284 	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
285 	if (red_is_idling(&q->vars))
286 		red_end_of_idle_period(&q->vars);
287 
288 	/* Is queue small? */
289 	if (q->vars.qavg <= p->qth_min)
290 		q->vars.qcount = -1;
291 	else {
292 		unsigned int idx;
293 
294 		/* Draw a packet at random from queue and compare flow */
295 		if (choke_match_random(q, skb, &idx)) {
296 			q->stats.matched++;
297 			choke_drop_by_idx(sch, idx);
298 			goto congestion_drop;
299 		}
300 
301 		/* Queue is large, always mark/drop */
302 		if (q->vars.qavg > p->qth_max) {
303 			q->vars.qcount = -1;
304 
305 			qdisc_qstats_overlimit(sch);
306 			if (use_harddrop(q) || !use_ecn(q) ||
307 			    !INET_ECN_set_ce(skb)) {
308 				q->stats.forced_drop++;
309 				goto congestion_drop;
310 			}
311 
312 			q->stats.forced_mark++;
313 		} else if (++q->vars.qcount) {
314 			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
315 				q->vars.qcount = 0;
316 				q->vars.qR = red_random(p);
317 
318 				qdisc_qstats_overlimit(sch);
319 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
320 					q->stats.prob_drop++;
321 					goto congestion_drop;
322 				}
323 
324 				q->stats.prob_mark++;
325 			}
326 		} else
327 			q->vars.qR = red_random(p);
328 	}
329 
330 	/* Admit new packet */
331 	if (sch->q.qlen < q->limit) {
332 		q->tab[q->tail] = skb;
333 		q->tail = (q->tail + 1) & q->tab_mask;
334 		++sch->q.qlen;
335 		qdisc_qstats_backlog_inc(sch, skb);
336 		return NET_XMIT_SUCCESS;
337 	}
338 
339 	q->stats.pdrop++;
340 	return qdisc_drop(skb, sch);
341 
342 congestion_drop:
343 	qdisc_drop(skb, sch);
344 	return NET_XMIT_CN;
345 
346 other_drop:
347 	if (ret & __NET_XMIT_BYPASS)
348 		qdisc_qstats_drop(sch);
349 	kfree_skb(skb);
350 	return ret;
351 }
352 
353 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
354 {
355 	struct choke_sched_data *q = qdisc_priv(sch);
356 	struct sk_buff *skb;
357 
358 	if (q->head == q->tail) {
359 		if (!red_is_idling(&q->vars))
360 			red_start_of_idle_period(&q->vars);
361 		return NULL;
362 	}
363 
364 	skb = q->tab[q->head];
365 	q->tab[q->head] = NULL;
366 	choke_zap_head_holes(q);
367 	--sch->q.qlen;
368 	qdisc_qstats_backlog_dec(sch, skb);
369 	qdisc_bstats_update(sch, skb);
370 
371 	return skb;
372 }
373 
374 static unsigned int choke_drop(struct Qdisc *sch)
375 {
376 	struct choke_sched_data *q = qdisc_priv(sch);
377 	unsigned int len;
378 
379 	len = qdisc_queue_drop(sch);
380 	if (len > 0)
381 		q->stats.other++;
382 	else {
383 		if (!red_is_idling(&q->vars))
384 			red_start_of_idle_period(&q->vars);
385 	}
386 
387 	return len;
388 }
389 
390 static void choke_reset(struct Qdisc *sch)
391 {
392 	struct choke_sched_data *q = qdisc_priv(sch);
393 
394 	red_restart(&q->vars);
395 }
396 
397 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
398 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
399 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
400 	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
401 };
402 
403 
/*
 * Release a packet table. The table may come from either kcalloc() or
 * vzalloc() (see choke_change()), hence kvfree().
 */
static void choke_free(void *addr)
{
	kvfree(addr);
}
408 
409 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
410 {
411 	struct choke_sched_data *q = qdisc_priv(sch);
412 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
413 	const struct tc_red_qopt *ctl;
414 	int err;
415 	struct sk_buff **old = NULL;
416 	unsigned int mask;
417 	u32 max_P;
418 
419 	if (opt == NULL)
420 		return -EINVAL;
421 
422 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
423 	if (err < 0)
424 		return err;
425 
426 	if (tb[TCA_CHOKE_PARMS] == NULL ||
427 	    tb[TCA_CHOKE_STAB] == NULL)
428 		return -EINVAL;
429 
430 	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
431 
432 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
433 
434 	if (ctl->limit > CHOKE_MAX_QUEUE)
435 		return -EINVAL;
436 
437 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
438 	if (mask != q->tab_mask) {
439 		struct sk_buff **ntab;
440 
441 		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
442 			       GFP_KERNEL | __GFP_NOWARN);
443 		if (!ntab)
444 			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
445 		if (!ntab)
446 			return -ENOMEM;
447 
448 		sch_tree_lock(sch);
449 		old = q->tab;
450 		if (old) {
451 			unsigned int oqlen = sch->q.qlen, tail = 0;
452 
453 			while (q->head != q->tail) {
454 				struct sk_buff *skb = q->tab[q->head];
455 
456 				q->head = (q->head + 1) & q->tab_mask;
457 				if (!skb)
458 					continue;
459 				if (tail < mask) {
460 					ntab[tail++] = skb;
461 					continue;
462 				}
463 				qdisc_qstats_backlog_dec(sch, skb);
464 				--sch->q.qlen;
465 				qdisc_drop(skb, sch);
466 			}
467 			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
468 			q->head = 0;
469 			q->tail = tail;
470 		}
471 
472 		q->tab_mask = mask;
473 		q->tab = ntab;
474 	} else
475 		sch_tree_lock(sch);
476 
477 	q->flags = ctl->flags;
478 	q->limit = ctl->limit;
479 
480 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
481 		      ctl->Plog, ctl->Scell_log,
482 		      nla_data(tb[TCA_CHOKE_STAB]),
483 		      max_P);
484 	red_set_vars(&q->vars);
485 
486 	if (q->head == q->tail)
487 		red_end_of_idle_period(&q->vars);
488 
489 	sch_tree_unlock(sch);
490 	choke_free(old);
491 	return 0;
492 }
493 
/* ->init() handler: initial setup is just a first choke_change(). */
static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	int err = choke_change(sch, opt);

	return err;
}
498 
499 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
500 {
501 	struct choke_sched_data *q = qdisc_priv(sch);
502 	struct nlattr *opts = NULL;
503 	struct tc_red_qopt opt = {
504 		.limit		= q->limit,
505 		.flags		= q->flags,
506 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
507 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
508 		.Wlog		= q->parms.Wlog,
509 		.Plog		= q->parms.Plog,
510 		.Scell_log	= q->parms.Scell_log,
511 	};
512 
513 	opts = nla_nest_start(skb, TCA_OPTIONS);
514 	if (opts == NULL)
515 		goto nla_put_failure;
516 
517 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
518 	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
519 		goto nla_put_failure;
520 	return nla_nest_end(skb, opts);
521 
522 nla_put_failure:
523 	nla_nest_cancel(skb, opts);
524 	return -EMSGSIZE;
525 }
526 
527 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
528 {
529 	struct choke_sched_data *q = qdisc_priv(sch);
530 	struct tc_choke_xstats st = {
531 		.early	= q->stats.prob_drop + q->stats.forced_drop,
532 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
533 		.pdrop	= q->stats.pdrop,
534 		.other	= q->stats.other,
535 		.matched = q->stats.matched,
536 	};
537 
538 	return gnet_stats_copy_app(d, &st, sizeof(st));
539 }
540 
541 static void choke_destroy(struct Qdisc *sch)
542 {
543 	struct choke_sched_data *q = qdisc_priv(sch);
544 
545 	tcf_destroy_chain(&q->filter_list);
546 	choke_free(q->tab);
547 }
548 
/* Class op stub: no class has a leaf qdisc, so always NULL. */
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct Qdisc *leaf = NULL;

	return leaf;
}
553 
554 static unsigned long choke_get(struct Qdisc *sch, u32 classid)
555 {
556 	return 0;
557 }
558 
/* Class op stub: nothing to release. Also used as the unbind_tcf op. */
static void choke_put(struct Qdisc *q, unsigned long cl)
{
	/* intentionally empty */
}
562 
563 static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
564 				u32 classid)
565 {
566 	return 0;
567 }
568 
569 static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
570 					       unsigned long cl)
571 {
572 	struct choke_sched_data *q = qdisc_priv(sch);
573 
574 	if (cl)
575 		return NULL;
576 	return &q->filter_list;
577 }
578 
579 static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
580 			  struct sk_buff *skb, struct tcmsg *tcm)
581 {
582 	tcm->tcm_handle |= TC_H_MIN(cl);
583 	return 0;
584 }
585 
586 static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
587 {
588 	if (!arg->stop) {
589 		if (arg->fn(sch, 1, arg) < 0) {
590 			arg->stop = 1;
591 			return;
592 		}
593 		arg->count++;
594 	}
595 }
596 
597 static const struct Qdisc_class_ops choke_class_ops = {
598 	.leaf		=	choke_leaf,
599 	.get		=	choke_get,
600 	.put		=	choke_put,
601 	.tcf_chain	=	choke_find_tcf,
602 	.bind_tcf	=	choke_bind,
603 	.unbind_tcf	=	choke_put,
604 	.dump		=	choke_dump_class,
605 	.walk		=	choke_walk,
606 };
607 
608 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
609 {
610 	struct choke_sched_data *q = qdisc_priv(sch);
611 
612 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
613 }
614 
615 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
616 	.id		=	"choke",
617 	.priv_size	=	sizeof(struct choke_sched_data),
618 
619 	.enqueue	=	choke_enqueue,
620 	.dequeue	=	choke_dequeue,
621 	.peek		=	choke_peek_head,
622 	.drop		=	choke_drop,
623 	.init		=	choke_init,
624 	.destroy	=	choke_destroy,
625 	.reset		=	choke_reset,
626 	.change		=	choke_change,
627 	.dump		=	choke_dump,
628 	.dump_stats	=	choke_dump_stats,
629 	.owner		=	THIS_MODULE,
630 };
631 
632 static int __init choke_module_init(void)
633 {
634 	return register_qdisc(&choke_qdisc_ops);
635 }
636 
637 static void __exit choke_module_exit(void)
638 {
639 	unregister_qdisc(&choke_qdisc_ops);
640 }
641 
642 module_init(choke_module_init)
643 module_exit(choke_module_exit)
644 
645 MODULE_LICENSE("GPL");
646