xref: /openbmc/linux/net/sched/sch_sfq.c (revision b627b4ed)
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM '95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which puts interactive traffic
	into a higher priority band.

	We still need true WFQ for the top-level CSZ, but using WFQ
	for best-effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128,
	the maximal MTU to 2^15-1, and the number of hash buckets to 1024.
	The only goal of these restrictions was to make all the data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not on the fly.  */
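
/* Usage example: from user space this qdisc is typically attached with the
 * tc tool, e.g.
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * which requests a fresh hash perturbation every 10 seconds ("eth0" is just
 * an example device name).
 */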

#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

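/* Compute the hash bucket for a packet.  For IPv4 and IPv6 the key is built
 * from the addresses and the transport protocol; for the common transport
 * protocols the first 32 bits past the network header (the TCP/UDP port
 * pair, or e.g. the ESP SPI) are mixed in as well, except for IPv4
 * fragments.  Anything else is keyed on the dst entry and the owning
 * socket.  The result is folded into SFQ_HASH_DIVISOR buckets by
 * sfq_fold_hash() above.
 */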
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph = ip_hdr(skb);
		h = iph->daddr;
		h2 = iph->saddr ^ iph->protocol;
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph = ipv6_hdr(skb);
		h = iph->daddr.s6_addr32[3];
		h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if (iph->nexthdr == IPPROTO_TCP ||
		    iph->nexthdr == IPPROTO_UDP ||
		    iph->nexthdr == IPPROTO_UDPLITE ||
		    iph->nexthdr == IPPROTO_SCTP ||
		    iph->nexthdr == IPPROTO_DCCP ||
		    iph->nexthdr == IPPROTO_ESP)
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
		h = (unsigned long)skb->dst ^ skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
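			/* fall through */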
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

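/* The dep[] array implements SFQ_DEPTH doubly linked lists, one per queue
 * length: entries 0..SFQ_DEPTH-1 are the per-slot links and entries
 * SFQ_DEPTH..2*SFQ_DEPTH-1 are the list heads, so dep[qlen + SFQ_DEPTH]
 * chains all slots currently holding qlen packets.  sfq_link() inserts a
 * slot into the list matching its current length; sfq_inc()/sfq_dec() move
 * a slot after its queue grew or shrank by one packet and keep max_depth
 * up to date.  This is what lets sfq_drop() find a longest slot in O(1).
 */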
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

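/* Drop one packet to make room: take the tail packet of a longest slot or,
 * when every active slot holds exactly one packet, drop from the slot that
 * is next in the round-robin ring and unlink that flow.  Returns the length
 * of the dropped packet, or 0 if the qdisc is empty.
 */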
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

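/* Enqueue path: classify the packet into a hash bucket, allocate a free slot
 * for a bucket that has no packets queued yet, tail-drop if that slot is
 * already at the configured limit, and append the packet.  A newly active
 * flow is linked into the round-robin ring and becomes the new tail.  If the
 * whole qdisc then exceeds its limit, one packet is dropped from the longest
 * slot and NET_XMIT_CN is returned.
 */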
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected queue has length q->limit, this means that
	 * all the other queues are empty and that we do simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return 0;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

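/* Dequeue in deficit-round-robin fashion: take the head packet of the slot
 * after the current tail, charge its length against that slot's allotment
 * and, when the allotment is exhausted or the slot becomes empty, advance
 * to the next slot, topping its allotment up by one quantum.  An emptied
 * slot is unhashed and removed from the ring; when the last slot empties
 * the ring is marked idle by setting tail to SFQ_DEPTH.
 */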
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

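/* Timer callback: pick a fresh perturbation value so that flows which happen
 * to collide in the hash do not stay glued together forever, then re-arm the
 * timer if periodic perturbation is configured.
 */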
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

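/* Apply a tc_sfq_qopt update under the qdisc tree lock: a zero quantum means
 * "use the device MTU", the perturbation period is given in seconds and
 * converted to jiffies, and the limit is clamped to SFQ_DEPTH - 1.  If the
 * new limit is lower than the current queue length, excess packets are
 * dropped and the parent qdiscs are notified of the shrinkage.
 */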
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

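/* One-time setup: prepare the deferrable perturbation timer, mark every hash
 * bucket and every slot as empty, install either the defaults (quantum =
 * device MTU, no perturbation) or the supplied options, and finally link all
 * slots into the depth-0 list so that sfq_enqueue() can allocate them.
 */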
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int sfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	return -EOPNOTSUPP;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

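/* Class statistics: classes are reported 1-based (class N corresponds to
 * hash bucket N - 1), so look up the slot through ht[] and report that
 * slot's queue length and remaining allotment.
 */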
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl-1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

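/* Walk the classes: every hash bucket that currently maps to a slot is
 * reported (again 1-based), honouring the walker's skip/count/stop protocol.
 */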
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.get		=	sfq_get,
	.change		=	sfq_change_class,
	.tcf_chain	=	sfq_find_tcf,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");