/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this
	scheduler should be used as a leaf of CBQ or P3, which puts
	interactive traffic into a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128 packets,
	the maximal MTU to 2^15-1, and the number of hash buckets to 1024.
	The only goal of these restrictions was that all the data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not at run time.  */

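/*
 * SFQ_DEPTH bounds both the number of flow slots and the overall queue
 * length (the default limit is SFQ_DEPTH - 1 packets).  SFQ_HASH_DIVISOR
 * is the number of hash buckets used to spread flows over the slots.
 */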
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};
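
/*
 * Slot layout: slots 0..SFQ_DEPTH-1 each hold the packet queue of one
 * (hashed) flow.  ht[] maps a hash bucket to its slot, with SFQ_DEPTH
 * meaning "no slot allocated".  Active slots are chained into a circular
 * round-robin list via next[], with q->tail indexing the current slot
 * in the round.  dep[] links slots by their current queue length: the
 * list heads live at indices SFQ_DEPTH..2*SFQ_DEPTH-1, so
 * dep[qlen + SFQ_DEPTH] heads the list of slots holding qlen packets.
 * dep[SFQ_DEPTH].next therefore yields a free slot, and
 * dep[max_depth + SFQ_DEPTH].next a longest one.
 */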

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

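/*
 * Classify a packet into a hash bucket.  For IPv4 and IPv6 the hash
 * covers the addresses and the protocol, plus the first 32 bits of the
 * transport header (the port pair for TCP/UDP-style protocols) when it
 * is safe to look at them.  Other packets are hashed by destination
 * entry and socket.  q->perturbation re-seeds the hash periodically.
 */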
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
	{
		const struct iphdr *iph = ip_hdr(skb);
		h = iph->daddr;
		h2 = iph->saddr^iph->protocol;
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case __constant_htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph = ipv6_hdr(skb);
		h = iph->daddr.s6_addr32[3];
		h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
		if (iph->nexthdr == IPPROTO_TCP ||
		    iph->nexthdr == IPPROTO_UDP ||
		    iph->nexthdr == IPPROTO_UDPLITE ||
		    iph->nexthdr == IPPROTO_SCTP ||
		    iph->nexthdr == IPPROTO_DCCP ||
		    iph->nexthdr == IPPROTO_ESP)
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
		h = (u32)(unsigned long)skb->dst^skb->protocol;
		h2 = (u32)(unsigned long)skb->sk;
	}
	return sfq_fold_hash(q, h, h2);
}

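/* Insert slot x at the head of the depth list matching its current queue length. */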
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

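/*
 * A packet has just been removed from slot x: unlink the slot from its
 * old depth list, shrink max_depth if x was the only slot at the
 * maximum depth, and relink it at its new (shorter) length.
 */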
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

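/*
 * A packet has just been added to slot x: move the slot to the depth
 * list for its new length and raise max_depth if needed.
 */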
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

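/*
 * Drop one packet to make room: take the tail packet of a longest slot.
 * If every active slot holds exactly one packet, drop from the next
 * slot in the round and unlink that slot from the round-robin list as
 * well.  Returns the dropped length, or 0 if there was nothing to drop.
 */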
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d+SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = skb->len;
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = skb->len;
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

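/*
 * Enqueue: hash the packet to a bucket, allocate a free slot for the
 * bucket if it has none yet, and append the packet to that slot.  A
 * slot that just became non-empty is added to the round-robin list
 * with a fresh quantum of credit.
 */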
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned hash = sfq_hash(q, skb);
	sfq_index x;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}
	/* If the selected queue already has length q->limit, all other
	 * queues must be empty, so we do a simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += skb->len;
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return 0;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

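/*
 * Requeue: like enqueue, but the packet goes back to the head of its
 * slot queue so it is dequeued first next time the slot is served.
 */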
static int
sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned hash = sfq_hash(q, skb);
	sfq_index x;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}
	sch->qstats.backlog += skb->len;
	__skb_queue_head(&q->qs[x], skb);
	/* If the selected queue now has length q->limit+1, all other
	 * queues must be empty, so we do a simple tail drop.
	 * This packet stays requeued at the head of the queue; the
	 * tail packet is dropped instead.
	 */
	if (q->qs[x].qlen > q->limit) {
		skb = q->qs[x].prev;
		__skb_unlink(skb, &q->qs[x]);
		sch->qstats.drops++;
		sch->qstats.backlog -= skb->len;
		kfree_skb(skb);
		return NET_XMIT_CN;
	}
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->qstats.requeues++;
		return 0;
	}

	sch->qstats.drops++;
	sfq_drop(sch);
	return NET_XMIT_CN;
}

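/*
 * Dequeue in deficit-round-robin fashion: take one packet from the next
 * slot in the round and charge its length against the slot's allotment.
 * A slot that runs out of packets leaves the round; a slot that runs
 * out of allotment keeps its place, but the round moves on and the
 * following slot is granted another quantum of credit.
 */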
static struct sk_buff *
sfq_dequeue(struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= skb->len;

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= skb->len) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc* sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

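/*
 * Timer handler: pick a new random perturbation value so that flows
 * which happened to collide in the hash get a chance to separate, then
 * rearm the timer if periodic perturbation is enabled.
 */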
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	get_random_bytes(&q->perturbation, 4);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

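/*
 * Apply a new configuration from a tc_sfq_qopt: quantum (defaults to
 * the device MTU), perturbation period and queue limit.  Excess packets
 * are dropped under the qdisc tree lock and the perturbation timer is
 * restarted as needed.
 */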
static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = RTA_DATA(opt);
	unsigned int qlen;

	if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
	q->perturb_period = ctl->perturb_period*HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, 4);
	}
	sch_tree_unlock(sch);
	return 0;
}

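/*
 * Initialise the qdisc: set up the perturbation timer, mark every hash
 * bucket and the round-robin list as empty, put every slot on the
 * zero-length depth list and apply either the defaults or the supplied
 * configuration.
 */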
static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	init_timer(&q->perturb_timer);
	q->perturb_timer.data = (unsigned long)sch;
	q->perturb_timer.function = sfq_perturbation;

	for (i=0; i<SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;
	for (i=0; i<SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
		q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
	}
	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(sch->dev);
		q->perturb_period = 0;
		get_random_bytes(&q->perturbation, 4);
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}
	for (i=0; i<SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	del_timer(&q->perturb_timer);
}

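/*
 * Report the current configuration as a tc_sfq_qopt.  The divisor is
 * fixed at SFQ_HASH_DIVISOR; flows is reported as the limit since the
 * number of concurrent flows is not tracked separately.
 */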
464 {
465 	struct sfq_sched_data *q = qdisc_priv(sch);
466 	unsigned char *b = skb_tail_pointer(skb);
467 	struct tc_sfq_qopt opt;
468 
469 	opt.quantum = q->quantum;
470 	opt.perturb_period = q->perturb_period/HZ;
471 
472 	opt.limit = q->limit;
473 	opt.divisor = SFQ_HASH_DIVISOR;
474 	opt.flows = q->limit;
475 
476 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
477 
478 	return skb->len;
479 
480 rtattr_failure:
481 	nlmsg_trim(skb, b);
482 	return -1;
483 }
484 
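/*
 * Typical userspace usage (illustrative): the qdisc is attached with
 * the tc tool, e.g. "tc qdisc add dev eth0 root sfq perturb 10" to
 * rehash flows every 10 seconds.
 */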
static struct Qdisc_ops sfq_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.requeue	=	sfq_requeue,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
513