xref: /openbmc/linux/net/sched/sch_choke.c (revision 565d76cb)
1 /*
2  * net/sched/sch_choke.c	CHOKE scheduler
3  *
4  * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5  * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * version 2 as published by the Free Software Foundation.
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/kernel.h>
16 #include <linux/skbuff.h>
17 #include <linux/reciprocal_div.h>
18 #include <linux/vmalloc.h>
19 #include <net/pkt_sched.h>
20 #include <net/inet_ecn.h>
21 #include <net/red.h>
22 #include <linux/ip.h>
23 #include <net/ip.h>
24 #include <linux/ipv6.h>
25 #include <net/ipv6.h>
26 
27 /*
28    CHOKe stateless AQM for fair bandwidth allocation
29    =================================================
30 
31    CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
32    unresponsive flows) is a variant of RED that penalizes misbehaving flows but
33    maintains no flow state. The difference from RED is an additional step
34    during the enqueuing process. If the average queue size is over the
35    low threshold (qmin), a packet is chosen at random from the queue.
36    If both the new and chosen packet are from the same flow, both
37    are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
38    needs to access packets in the queue at random. It has a minimal class
39    interface to allow overriding the builtin flow classifier with
40    filters.
41 
42    Source:
43    R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
44    Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
45    IEEE INFOCOM, 2000.
46 
47    A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
48    Characteristics", IEEE/ACM Transactions on Networking, 2004
49 
50  */
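
/*
 * Sketch of the enqueue decision implemented below (see choke_enqueue()):
 *
 *	qavg = RED exponentially weighted average of the queue length
 *	if qavg <= qth_min:
 *		enqueue
 *	else:
 *		pick a packet at random from the queue
 *		if it belongs to the same flow as the new packet:
 *			drop both				(the CHOKe step)
 *		else if qavg > qth_max:
 *			ECN-mark or drop the new packet
 *		else:
 *			ECN-mark or drop with RED probability
 *	a surviving packet is still tail-dropped once the configured limit
 *	is reached
 *
 * For illustration only, a CHOKe qdisc might be configured from userspace
 * with RED-style parameters along these lines (parameter names as in
 * tc-choke(8); the values are arbitrary):
 *
 *	tc qdisc add dev eth0 root choke limit 1000 min 83 max 250 \
 *		avpkt 1000 burst 139 bandwidth 10mbit probability 0.02
 */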
51 
52 /* Upper bound on size of sk_buff table (packets) */
53 #define CHOKE_MAX_QUEUE	(128*1024 - 1)
54 
55 struct choke_sched_data {
56 /* Parameters */
57 	u32		 limit;
58 	unsigned char	 flags;
59 
60 	struct red_parms parms;
61 
62 /* Variables */
63 	struct tcf_proto *filter_list;
64 	struct {
65 		u32	prob_drop;	/* Early probability drops */
66 		u32	prob_mark;	/* Early probability marks */
67 		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
68 		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
69 		u32	pdrop;          /* Drops due to queue limits */
70 		u32	other;          /* Drops due to drop() calls */
71 		u32	matched;	/* Drops due to flow match */
72 	} stats;
73 
74 	unsigned int	 head;
75 	unsigned int	 tail;
76 
77 	unsigned int	 tab_mask; /* size - 1 */
78 
79 	struct sk_buff **tab;
80 };
81 
82 /* deliver a random number between 0 and N - 1 */
83 static u32 random_N(unsigned int N)
84 {
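	/* (u64)random32() * N >> 32 maps a uniform 32-bit value into [0, N) */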
85 	return reciprocal_divide(random32(), N);
86 }
87 
88 /* number of elements in queue including holes */
89 static unsigned int choke_len(const struct choke_sched_data *q)
90 {
91 	return (q->tail - q->head) & q->tab_mask;
92 }
93 
94 /* Is ECN parameter configured */
95 static int use_ecn(const struct choke_sched_data *q)
96 {
97 	return q->flags & TC_RED_ECN;
98 }
99 
100 /* Should packets over max just be dropped (versus marked) */
101 static int use_harddrop(const struct choke_sched_data *q)
102 {
103 	return q->flags & TC_RED_HARDDROP;
104 }
105 
106 /* Move head pointer forward to skip over holes */
107 static void choke_zap_head_holes(struct choke_sched_data *q)
108 {
109 	do {
110 		q->head = (q->head + 1) & q->tab_mask;
111 		if (q->head == q->tail)
112 			break;
113 	} while (q->tab[q->head] == NULL);
114 }
115 
116 /* Move tail pointer backwards to reuse holes */
117 static void choke_zap_tail_holes(struct choke_sched_data *q)
118 {
119 	do {
120 		q->tail = (q->tail - 1) & q->tab_mask;
121 		if (q->head == q->tail)
122 			break;
123 	} while (q->tab[q->tail] == NULL);
124 }
125 
126 /* Drop packet from queue array by creating a "hole" */
127 static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
128 {
129 	struct choke_sched_data *q = qdisc_priv(sch);
130 	struct sk_buff *skb = q->tab[idx];
131 
132 	q->tab[idx] = NULL;
133 
134 	if (idx == q->head)
135 		choke_zap_head_holes(q);
136 	if (idx == q->tail)
137 		choke_zap_tail_holes(q);
138 
139 	sch->qstats.backlog -= qdisc_pkt_len(skb);
140 	qdisc_drop(skb, sch);
141 	qdisc_tree_decrease_qlen(sch, 1);
142 	--sch->q.qlen;
143 }
144 
145 /*
146  * Compare flow of two packets
147  *  Returns true only if source and destination address and port match;
148  *          false for special cases (e.g. non-IP packets or rxhash mismatch)
149  */
150 static bool choke_match_flow(struct sk_buff *skb1,
151 			     struct sk_buff *skb2)
152 {
153 	int off1, off2, poff;
154 	const u32 *ports1, *ports2;
155 	u8 ip_proto;
156 	__u32 hash1;
157 
158 	if (skb1->protocol != skb2->protocol)
159 		return false;
160 
161 	/* Use hash value as quick check
162 	 * Assumes that __skb_get_rxhash makes IP header and ports linear
163 	 */
164 	hash1 = skb_get_rxhash(skb1);
165 	if (!hash1 || hash1 != skb_get_rxhash(skb2))
166 		return false;
167 
168 	/* Probably match, but be sure to avoid hash collisions */
169 	off1 = skb_network_offset(skb1);
170 	off2 = skb_network_offset(skb2);
171 
172 	switch (skb1->protocol) {
173 	case __constant_htons(ETH_P_IP): {
174 		const struct iphdr *ip1, *ip2;
175 
176 		ip1 = (const struct iphdr *) (skb1->data + off1);
177 		ip2 = (const struct iphdr *) (skb2->data + off2);
178 
179 		ip_proto = ip1->protocol;
180 		if (ip_proto != ip2->protocol ||
181 		    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
182 			return false;
183 
184 		if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
185 			ip_proto = 0;
186 		off1 += ip1->ihl * 4;
187 		off2 += ip2->ihl * 4;
188 		break;
189 	}
190 
191 	case __constant_htons(ETH_P_IPV6): {
192 		const struct ipv6hdr *ip1, *ip2;
193 
194 		ip1 = (const struct ipv6hdr *) (skb1->data + off1);
195 		ip2 = (const struct ipv6hdr *) (skb2->data + off2);
196 
197 		ip_proto = ip1->nexthdr;
198 		if (ip_proto != ip2->nexthdr ||
199 		    ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
200 		    ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
201 			return false;
202 		off1 += 40;
203 		off2 += 40;
		break;
204 	}
205 
206 	default: /* Maybe compare MAC header here? */
207 		return false;
208 	}
209 
210 	poff = proto_ports_offset(ip_proto);
211 	if (poff < 0)
212 		return true;
213 
214 	off1 += poff;
215 	off2 += poff;
216 
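	/* transport source and destination ports are adjacent 16-bit
	 * fields: compare both with a single 32-bit load
	 */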
217 	ports1 = (__force u32 *)(skb1->data + off1);
218 	ports2 = (__force u32 *)(skb2->data + off2);
219 	return *ports1 == *ports2;
220 }
221 
222 struct choke_skb_cb {
223 	u16 classid;
224 };
225 
226 static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
227 {
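	/* the private cb[] area must hold qdisc_skb_cb plus our classid */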
228 	BUILD_BUG_ON(sizeof(skb->cb) <
229 		sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb));
230 	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
231 }
232 
233 static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
234 {
235 	choke_skb_cb(skb)->classid = classid;
236 }
237 
238 static u16 choke_get_classid(const struct sk_buff *skb)
239 {
240 	return choke_skb_cb(skb)->classid;
241 }
242 
243 /*
244  * Classify flow using either:
245  *  1. pre-existing classification result in skb
246  *  2. fast internal classification
247  *  3. use TC filter based classification
248  */
249 static bool choke_classify(struct sk_buff *skb,
250 			   struct Qdisc *sch, int *qerr)
251 
252 {
253 	struct choke_sched_data *q = qdisc_priv(sch);
254 	struct tcf_result res;
255 	int result;
256 
257 	result = tc_classify(skb, q->filter_list, &res);
258 	if (result >= 0) {
259 #ifdef CONFIG_NET_CLS_ACT
260 		switch (result) {
261 		case TC_ACT_STOLEN:
262 		case TC_ACT_QUEUED:
263 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
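			/* fall through */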
264 		case TC_ACT_SHOT:
265 			return false;
266 		}
267 #endif
268 		choke_set_classid(skb, TC_H_MIN(res.classid));
269 		return true;
270 	}
271 
272 	return false;
273 }
274 
275 /*
276  * Select a packet at random from queue
277  * HACK: since the queue can have holes from previous deletions, retry several
278  *   times to find a random skb, but then just give up and return the head
279  * Will return NULL if queue is empty (q->head == q->tail)
280  */
281 static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
282 					 unsigned int *pidx)
283 {
284 	struct sk_buff *skb;
285 	int retries = 3;
286 
287 	do {
288 		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
289 		skb = q->tab[*pidx];
290 		if (skb)
291 			return skb;
292 	} while (--retries > 0);
293 
294 	return q->tab[*pidx = q->head];
295 }
296 
297 /*
298  * Compare new packet with random packet in queue
299  * returns true if matched and sets *pidx
300  */
301 static bool choke_match_random(const struct choke_sched_data *q,
302 			       struct sk_buff *nskb,
303 			       unsigned int *pidx)
304 {
305 	struct sk_buff *oskb;
306 
307 	if (q->head == q->tail)
308 		return false;
309 
310 	oskb = choke_peek_random(q, pidx);
311 	if (q->filter_list)
312 		return choke_get_classid(nskb) == choke_get_classid(oskb);
313 
314 	return choke_match_flow(oskb, nskb);
315 }
316 
317 static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
318 {
319 	struct choke_sched_data *q = qdisc_priv(sch);
320 	struct red_parms *p = &q->parms;
321 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
322 
323 	if (q->filter_list) {
324 		/* If using external classifiers, get result and record it. */
325 		if (!choke_classify(skb, sch, &ret))
326 			goto other_drop;	/* Packet was eaten by filter */
327 	}
328 
329 	/* Compute average queue usage (see RED) */
330 	p->qavg = red_calc_qavg(p, sch->q.qlen);
331 	if (red_is_idling(p))
332 		red_end_of_idle_period(p);
333 
334 	/* Is queue small? */
335 	if (p->qavg <= p->qth_min)
336 		p->qcount = -1;
337 	else {
338 		unsigned int idx;
339 
340 		/* Draw a packet at random from queue and compare flow */
341 		if (choke_match_random(q, skb, &idx)) {
342 			q->stats.matched++;
343 			choke_drop_by_idx(sch, idx);
344 			goto congestion_drop;
345 		}
346 
347 		/* Queue is large, always mark/drop */
348 		if (p->qavg > p->qth_max) {
349 			p->qcount = -1;
350 
351 			sch->qstats.overlimits++;
352 			if (use_harddrop(q) || !use_ecn(q) ||
353 			    !INET_ECN_set_ce(skb)) {
354 				q->stats.forced_drop++;
355 				goto congestion_drop;
356 			}
357 
358 			q->stats.forced_mark++;
359 		} else if (++p->qcount) {
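			/* min_th < qavg <= max_th: RED probabilistic mark/drop */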
360 			if (red_mark_probability(p, p->qavg)) {
361 				p->qcount = 0;
362 				p->qR = red_random(p);
363 
364 				sch->qstats.overlimits++;
365 				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
366 					q->stats.prob_drop++;
367 					goto congestion_drop;
368 				}
369 
370 				q->stats.prob_mark++;
371 			}
372 		} else
373 			p->qR = red_random(p);
374 	}
375 
376 	/* Admit new packet */
377 	if (sch->q.qlen < q->limit) {
378 		q->tab[q->tail] = skb;
379 		q->tail = (q->tail + 1) & q->tab_mask;
380 		++sch->q.qlen;
381 		sch->qstats.backlog += qdisc_pkt_len(skb);
382 		return NET_XMIT_SUCCESS;
383 	}
384 
385 	q->stats.pdrop++;
386 	sch->qstats.drops++;
387 	kfree_skb(skb);
388 	return NET_XMIT_DROP;
389 
390  congestion_drop:
391 	qdisc_drop(skb, sch);
392 	return NET_XMIT_CN;
393 
394  other_drop:
395 	if (ret & __NET_XMIT_BYPASS)
396 		sch->qstats.drops++;
397 	kfree_skb(skb);
398 	return ret;
399 }
400 
401 static struct sk_buff *choke_dequeue(struct Qdisc *sch)
402 {
403 	struct choke_sched_data *q = qdisc_priv(sch);
404 	struct sk_buff *skb;
405 
406 	if (q->head == q->tail) {
407 		if (!red_is_idling(&q->parms))
408 			red_start_of_idle_period(&q->parms);
409 		return NULL;
410 	}
411 
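	/* dequeue the packet at head and advance past any holes */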
412 	skb = q->tab[q->head];
413 	q->tab[q->head] = NULL;
414 	choke_zap_head_holes(q);
415 	--sch->q.qlen;
416 	sch->qstats.backlog -= qdisc_pkt_len(skb);
417 	qdisc_bstats_update(sch, skb);
418 
419 	return skb;
420 }
421 
422 static unsigned int choke_drop(struct Qdisc *sch)
423 {
424 	struct choke_sched_data *q = qdisc_priv(sch);
425 	unsigned int len;
426 
427 	len = qdisc_queue_drop(sch);
428 	if (len > 0)
429 		q->stats.other++;
430 	else {
431 		if (!red_is_idling(&q->parms))
432 			red_start_of_idle_period(&q->parms);
433 	}
434 
435 	return len;
436 }
437 
438 static void choke_reset(struct Qdisc *sch)
439 {
440 	struct choke_sched_data *q = qdisc_priv(sch);
441 
442 	red_restart(&q->parms);
443 }
444 
445 static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
446 	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
447 	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
448 };
449 
450 
451 static void choke_free(void *addr)
452 {
453 	if (addr) {
454 		if (is_vmalloc_addr(addr))
455 			vfree(addr);
456 		else
457 			kfree(addr);
458 	}
459 }
460 
461 static int choke_change(struct Qdisc *sch, struct nlattr *opt)
462 {
463 	struct choke_sched_data *q = qdisc_priv(sch);
464 	struct nlattr *tb[TCA_CHOKE_MAX + 1];
465 	const struct tc_red_qopt *ctl;
466 	int err;
467 	struct sk_buff **old = NULL;
468 	unsigned int mask;
469 
470 	if (opt == NULL)
471 		return -EINVAL;
472 
473 	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
474 	if (err < 0)
475 		return err;
476 
477 	if (tb[TCA_CHOKE_PARMS] == NULL ||
478 	    tb[TCA_CHOKE_STAB] == NULL)
479 		return -EINVAL;
480 
481 	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
482 
483 	if (ctl->limit > CHOKE_MAX_QUEUE)
484 		return -EINVAL;
485 
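	/* table size must be a power of two so that head/tail
	 * arithmetic can simply use & tab_mask
	 */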
486 	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
487 	if (mask != q->tab_mask) {
488 		struct sk_buff **ntab;
489 
490 		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
491 		if (!ntab)
492 			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
493 		if (!ntab)
494 			return -ENOMEM;
495 
496 		sch_tree_lock(sch);
497 		old = q->tab;
498 		if (old) {
499 			unsigned int oqlen = sch->q.qlen, tail = 0;
500 
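			/* copy packets from the old table, dropping any that
			 * do not fit in the new one
			 */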
501 			while (q->head != q->tail) {
502 				struct sk_buff *skb = q->tab[q->head];
503 
504 				q->head = (q->head + 1) & q->tab_mask;
505 				if (!skb)
506 					continue;
507 				if (tail < mask) {
508 					ntab[tail++] = skb;
509 					continue;
510 				}
511 				sch->qstats.backlog -= qdisc_pkt_len(skb);
512 				--sch->q.qlen;
513 				qdisc_drop(skb, sch);
514 			}
515 			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
516 			q->head = 0;
517 			q->tail = tail;
518 		}
519 
520 		q->tab_mask = mask;
521 		q->tab = ntab;
522 	} else
523 		sch_tree_lock(sch);
524 
525 	q->flags = ctl->flags;
526 	q->limit = ctl->limit;
527 
528 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
529 		      ctl->Plog, ctl->Scell_log,
530 		      nla_data(tb[TCA_CHOKE_STAB]));
531 
532 	if (q->head == q->tail)
533 		red_end_of_idle_period(&q->parms);
534 
535 	sch_tree_unlock(sch);
536 	choke_free(old);
537 	return 0;
538 }
539 
540 static int choke_init(struct Qdisc *sch, struct nlattr *opt)
541 {
542 	return choke_change(sch, opt);
543 }
544 
545 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
546 {
547 	struct choke_sched_data *q = qdisc_priv(sch);
548 	struct nlattr *opts = NULL;
549 	struct tc_red_qopt opt = {
550 		.limit		= q->limit,
551 		.flags		= q->flags,
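		/* thresholds are stored scaled by 2^Wlog; unscale for userspace */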
552 		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
553 		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
554 		.Wlog		= q->parms.Wlog,
555 		.Plog		= q->parms.Plog,
556 		.Scell_log	= q->parms.Scell_log,
557 	};
558 
559 	opts = nla_nest_start(skb, TCA_OPTIONS);
560 	if (opts == NULL)
561 		goto nla_put_failure;
562 
563 	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
564 	return nla_nest_end(skb, opts);
565 
566 nla_put_failure:
567 	nla_nest_cancel(skb, opts);
568 	return -EMSGSIZE;
569 }
570 
571 static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
572 {
573 	struct choke_sched_data *q = qdisc_priv(sch);
574 	struct tc_choke_xstats st = {
575 		.early	= q->stats.prob_drop + q->stats.forced_drop,
576 		.marked	= q->stats.prob_mark + q->stats.forced_mark,
577 		.pdrop	= q->stats.pdrop,
578 		.other	= q->stats.other,
579 		.matched = q->stats.matched,
580 	};
581 
582 	return gnet_stats_copy_app(d, &st, sizeof(st));
583 }
584 
585 static void choke_destroy(struct Qdisc *sch)
586 {
587 	struct choke_sched_data *q = qdisc_priv(sch);
588 
589 	tcf_destroy_chain(&q->filter_list);
590 	choke_free(q->tab);
591 }
592 
593 static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
594 {
595 	return NULL;
596 }
597 
598 static unsigned long choke_get(struct Qdisc *sch, u32 classid)
599 {
600 	return 0;
601 }
602 
603 static void choke_put(struct Qdisc *q, unsigned long cl)
604 {
605 }
606 
607 static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
608 				u32 classid)
609 {
610 	return 0;
611 }
612 
613 static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
614 {
615 	struct choke_sched_data *q = qdisc_priv(sch);
616 
617 	if (cl)
618 		return NULL;
619 	return &q->filter_list;
620 }
621 
622 static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
623 			  struct sk_buff *skb, struct tcmsg *tcm)
624 {
625 	tcm->tcm_handle |= TC_H_MIN(cl);
626 	return 0;
627 }
628 
629 static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
630 {
631 	if (!arg->stop) {
632 		if (arg->fn(sch, 1, arg) < 0) {
633 			arg->stop = 1;
634 			return;
635 		}
636 		arg->count++;
637 	}
638 }
639 
640 static const struct Qdisc_class_ops choke_class_ops = {
641 	.leaf		=	choke_leaf,
642 	.get		=	choke_get,
643 	.put		=	choke_put,
644 	.tcf_chain	=	choke_find_tcf,
645 	.bind_tcf	=	choke_bind,
646 	.unbind_tcf	=	choke_put,
647 	.dump		=	choke_dump_class,
648 	.walk		=	choke_walk,
649 };
650 
651 static struct sk_buff *choke_peek_head(struct Qdisc *sch)
652 {
653 	struct choke_sched_data *q = qdisc_priv(sch);
654 
655 	return (q->head != q->tail) ? q->tab[q->head] : NULL;
656 }
657 
658 static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
659 	.id		=	"choke",
660 	.priv_size	=	sizeof(struct choke_sched_data),
661 
662 	.enqueue	=	choke_enqueue,
663 	.dequeue	=	choke_dequeue,
664 	.peek		=	choke_peek_head,
665 	.drop		=	choke_drop,
666 	.init		=	choke_init,
667 	.destroy	=	choke_destroy,
668 	.reset		=	choke_reset,
669 	.change		=	choke_change,
670 	.dump		=	choke_dump,
671 	.dump_stats	=	choke_dump_stats,
672 	.owner		=	THIS_MODULE,
673 };
674 
675 static int __init choke_module_init(void)
676 {
677 	return register_qdisc(&choke_qdisc_ops);
678 }
679 
680 static void __exit choke_module_exit(void)
681 {
682 	unregister_qdisc(&choke_qdisc_ops);
683 }
684 
685 module_init(choke_module_init)
686 module_exit(choke_module_exit)
687 
688 MODULE_LICENSE("GPL");
689