xref: /openbmc/linux/net/sched/sch_sfq.c (revision d2999e1b)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/flow_keys.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOM '90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are considered as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic from non-interactive traffic. This means that this scheduler
62 	should be used as a leaf of CBQ or P3, which put the interactive
63 	traffic into a higher priority band.
64 
65 	We still need true WFQ for top level CSZ, but using WFQ
66 	for the best effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits:
71 	- the maximal queue length per flow to 127 packets,
72 	- the max mtu to 2^18-1,
73 	- the number of flows to 65408,
74 	- the number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but not while the qdisc is in use.  */
77 
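/*	Example usage (a sketch, assuming iproute2's tc; "eth0" is only a
	placeholder device name):

		tc qdisc add dev eth0 root sfq perturb 10

	"perturb 10" re-keys the flow hash every 10 seconds; see
	sfq_perturbation() and q->perturb_period below.  */
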
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K.
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
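/* For example, a 1500 byte packet accounts for SFQ_ALLOT_SIZE(1500) =
 * DIV_ROUND_UP(1500, 8) = 188 scaled units, and a worst-case 64KB packet
 * for at most 8192 units, which fits comfortably in the 16-bit allot
 * field of struct sfq_slot below.
 */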
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We don't use pointers, in order to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to dep[] array
98  */
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto *filter_list;
129 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
130 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
131 
132 	struct red_parms *red_parms;
133 	struct tc_sfqred_stats stats;
134 	struct sfq_slot *tail;		/* current slot in round */
135 
136 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
137 					/* Linked lists of slots, indexed by depth
138 					 * dep[0] : list of unused flows
139 					 * dep[1] : list of flows with 1 packet
140 					 * dep[X] : list of flows with X packets
141 					 */
142 
143 	unsigned int	maxflows;	/* number of flows in flows array */
144 	int		perturb_period;
145 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
146 	struct timer_list perturb_timer;
147 };
148 
149 /*
150  * sfq_head structures live either in an sfq_slot or in the dep[] array
151  */
152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
153 {
154 	if (val < SFQ_MAX_FLOWS)
155 		return &q->slots[val].dep;
156 	return &q->dep[val - SFQ_MAX_FLOWS];
157 }
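/* For example, with SFQ_MAX_FLOWS == 65408, an sfq_index of 3 refers to
 * slots[3].dep, while an sfq_index of 65408 + 2 refers to dep[2], the
 * list of flows currently holding two packets.
 */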
158 
159 /*
160  * In order to be able to quickly rehash our queue when the timer changes
161  * q->perturbation, we store flow_keys in skb->cb[]
162  */
163 struct sfq_skb_cb {
164        struct flow_keys        keys;
165 };
166 
167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
168 {
169 	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
170 	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
171 }
172 
173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
174 			     const struct sk_buff *skb)
175 {
176 	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
177 	unsigned int hash;
178 
179 	hash = jhash_3words((__force u32)keys->dst,
180 			    (__force u32)keys->src ^ keys->ip_proto,
181 			    (__force u32)keys->ports, q->perturbation);
182 	return hash & (q->divisor - 1);
183 }
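/* Note: the mask above relies on q->divisor being a power of two, which
 * sfq_change() enforces via is_power_of_2().
 */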
184 
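/* Returns 0 to signal a drop/bypass (*qerr is then set), otherwise the
 * selected bucket index plus one; sfq_enqueue() undoes the +1 offset.
 */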
185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
186 				 int *qerr)
187 {
188 	struct sfq_sched_data *q = qdisc_priv(sch);
189 	struct tcf_result res;
190 	int result;
191 
192 	if (TC_H_MAJ(skb->priority) == sch->handle &&
193 	    TC_H_MIN(skb->priority) > 0 &&
194 	    TC_H_MIN(skb->priority) <= q->divisor)
195 		return TC_H_MIN(skb->priority);
196 
197 	if (!q->filter_list) {
198 		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
199 		return sfq_hash(q, skb) + 1;
200 	}
201 
202 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
203 	result = tc_classify(skb, q->filter_list, &res);
204 	if (result >= 0) {
205 #ifdef CONFIG_NET_CLS_ACT
206 		switch (result) {
207 		case TC_ACT_STOLEN:
208 		case TC_ACT_QUEUED:
209 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
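			/* fall through */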
210 		case TC_ACT_SHOT:
211 			return 0;
212 		}
213 #endif
214 		if (TC_H_MIN(res.classid) <= q->divisor)
215 			return TC_H_MIN(res.classid);
216 	}
217 	return 0;
218 }
219 
220 /*
221  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
222  */
223 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
224 {
225 	sfq_index p, n;
226 	struct sfq_slot *slot = &q->slots[x];
227 	int qlen = slot->qlen;
228 
229 	p = qlen + SFQ_MAX_FLOWS;
230 	n = q->dep[qlen].next;
231 
232 	slot->dep.next = n;
233 	slot->dep.prev = p;
234 
235 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
236 	sfq_dep_head(q, n)->prev = x;
237 }
238 
239 #define sfq_unlink(q, x, n, p)			\
240 	do {					\
241 		n = q->slots[x].dep.next;	\
242 		p = q->slots[x].dep.prev;	\
243 		sfq_dep_head(q, p)->next = n;	\
244 		sfq_dep_head(q, n)->prev = p;	\
245 	} while (0)
246 
247 
248 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
249 {
250 	sfq_index p, n;
251 	int d;
252 
253 	sfq_unlink(q, x, n, p);
254 
255 	d = q->slots[x].qlen--;
256 	if (n == p && q->cur_depth == d)
257 		q->cur_depth--;
258 	sfq_link(q, x);
259 }
260 
261 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
262 {
263 	sfq_index p, n;
264 	int d;
265 
266 	sfq_unlink(q, x, n, p);
267 
268 	d = ++q->slots[x].qlen;
269 	if (q->cur_depth < d)
270 		q->cur_depth = d;
271 	sfq_link(q, x);
272 }
273 
274 /* helper functions: might be changed when/if skbs use a standard list_head */
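/*
 * The slot itself acts as the sentinel of its circular skb list: the
 * (struct sk_buff *)slot casts below rely on skblist_next/skblist_prev
 * being the first two fields of struct sfq_slot, mirroring the next/prev
 * pointers at the start of struct sk_buff.
 */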
275 
276 /* remove one skb from tail of slot queue */
277 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
278 {
279 	struct sk_buff *skb = slot->skblist_prev;
280 
281 	slot->skblist_prev = skb->prev;
282 	skb->prev->next = (struct sk_buff *)slot;
283 	skb->next = skb->prev = NULL;
284 	return skb;
285 }
286 
287 /* remove one skb from head of slot queue */
288 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
289 {
290 	struct sk_buff *skb = slot->skblist_next;
291 
292 	slot->skblist_next = skb->next;
293 	skb->next->prev = (struct sk_buff *)slot;
294 	skb->next = skb->prev = NULL;
295 	return skb;
296 }
297 
298 static inline void slot_queue_init(struct sfq_slot *slot)
299 {
300 	memset(slot, 0, sizeof(*slot));
301 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
302 }
303 
304 /* add skb to slot queue (tail add) */
305 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
306 {
307 	skb->prev = slot->skblist_prev;
308 	skb->next = (struct sk_buff *)slot;
309 	slot->skblist_prev->next = skb;
310 	slot->skblist_prev = skb;
311 }
312 
313 #define	slot_queue_walk(slot, skb)		\
314 	for (skb = slot->skblist_next;		\
315 	     skb != (struct sk_buff *)slot;	\
316 	     skb = skb->next)
317 
318 static unsigned int sfq_drop(struct Qdisc *sch)
319 {
320 	struct sfq_sched_data *q = qdisc_priv(sch);
321 	sfq_index x, d = q->cur_depth;
322 	struct sk_buff *skb;
323 	unsigned int len;
324 	struct sfq_slot *slot;
325 
326 	/* Queue is full! Find the longest slot and drop a packet from it (head or tail, depending on q->headdrop) */
327 	if (d > 1) {
328 		x = q->dep[d].next;
329 		slot = &q->slots[x];
330 drop:
331 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
332 		len = qdisc_pkt_len(skb);
333 		slot->backlog -= len;
334 		sfq_dec(q, x);
335 		kfree_skb(skb);
336 		sch->q.qlen--;
337 		sch->qstats.drops++;
338 		sch->qstats.backlog -= len;
339 		return len;
340 	}
341 
342 	if (d == 1) {
343 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
344 		x = q->tail->next;
345 		slot = &q->slots[x];
346 		q->tail->next = slot->next;
347 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
348 		goto drop;
349 	}
350 
351 	return 0;
352 }
353 
354 /* Is ECN parameter configured */
355 static int sfq_prob_mark(const struct sfq_sched_data *q)
356 {
357 	return q->flags & TC_RED_ECN;
358 }
359 
360 /* Should packets over max threshold just be marked */
361 static int sfq_hard_mark(const struct sfq_sched_data *q)
362 {
363 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
364 }
365 
366 static int sfq_headdrop(const struct sfq_sched_data *q)
367 {
368 	return q->headdrop;
369 }
370 
371 static int
372 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
373 {
374 	struct sfq_sched_data *q = qdisc_priv(sch);
375 	unsigned int hash;
376 	sfq_index x, qlen;
377 	struct sfq_slot *slot;
378 	int uninitialized_var(ret);
379 	struct sk_buff *head;
380 	int delta;
381 
382 	hash = sfq_classify(skb, sch, &ret);
383 	if (hash == 0) {
384 		if (ret & __NET_XMIT_BYPASS)
385 			sch->qstats.drops++;
386 		kfree_skb(skb);
387 		return ret;
388 	}
389 	hash--;
390 
391 	x = q->ht[hash];
392 	slot = &q->slots[x];
393 	if (x == SFQ_EMPTY_SLOT) {
394 		x = q->dep[0].next; /* get a free slot */
395 		if (x >= SFQ_MAX_FLOWS)
396 			return qdisc_drop(skb, sch);
397 		q->ht[hash] = x;
398 		slot = &q->slots[x];
399 		slot->hash = hash;
400 		slot->backlog = 0; /* should already be 0 anyway... */
401 		red_set_vars(&slot->vars);
402 		goto enqueue;
403 	}
404 	if (q->red_parms) {
405 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
406 							&slot->vars,
407 							slot->backlog);
408 		switch (red_action(q->red_parms,
409 				   &slot->vars,
410 				   slot->vars.qavg)) {
411 		case RED_DONT_MARK:
412 			break;
413 
414 		case RED_PROB_MARK:
415 			sch->qstats.overlimits++;
416 			if (sfq_prob_mark(q)) {
417 				/* We know we have at least one packet in queue */
418 				if (sfq_headdrop(q) &&
419 				    INET_ECN_set_ce(slot->skblist_next)) {
420 					q->stats.prob_mark_head++;
421 					break;
422 				}
423 				if (INET_ECN_set_ce(skb)) {
424 					q->stats.prob_mark++;
425 					break;
426 				}
427 			}
428 			q->stats.prob_drop++;
429 			goto congestion_drop;
430 
431 		case RED_HARD_MARK:
432 			sch->qstats.overlimits++;
433 			if (sfq_hard_mark(q)) {
434 				/* We know we have at least one packet in queue */
435 				if (sfq_headdrop(q) &&
436 				    INET_ECN_set_ce(slot->skblist_next)) {
437 					q->stats.forced_mark_head++;
438 					break;
439 				}
440 				if (INET_ECN_set_ce(skb)) {
441 					q->stats.forced_mark++;
442 					break;
443 				}
444 			}
445 			q->stats.forced_drop++;
446 			goto congestion_drop;
447 		}
448 	}
449 
450 	if (slot->qlen >= q->maxdepth) {
451 congestion_drop:
452 		if (!sfq_headdrop(q))
453 			return qdisc_drop(skb, sch);
454 
455 		/* We know we have at least one packet in queue */
456 		head = slot_dequeue_head(slot);
457 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
458 		sch->qstats.backlog -= delta;
459 		slot->backlog -= delta;
460 		qdisc_drop(head, sch);
461 
462 		slot_queue_add(slot, skb);
463 		return NET_XMIT_CN;
464 	}
465 
466 enqueue:
467 	sch->qstats.backlog += qdisc_pkt_len(skb);
468 	slot->backlog += qdisc_pkt_len(skb);
469 	slot_queue_add(slot, skb);
470 	sfq_inc(q, x);
471 	if (slot->qlen == 1) {		/* The flow is new */
472 		if (q->tail == NULL) {	/* It is the first flow */
473 			slot->next = x;
474 		} else {
475 			slot->next = q->tail->next;
476 			q->tail->next = x;
477 		}
478 		/* We put this flow at the end of our flow list.
479 		 * It might sound unfair that a new flow has to wait behind old ones, but
480 		 * otherwise we could end up servicing only new flows and starving old ones.
481 		 */
482 		q->tail = slot;
483 		/* We could use a bigger initial quantum for new flows */
484 		slot->allot = q->scaled_quantum;
485 	}
486 	if (++sch->q.qlen <= q->limit)
487 		return NET_XMIT_SUCCESS;
488 
489 	qlen = slot->qlen;
490 	sfq_drop(sch);
491 	/* Return Congestion Notification only if we dropped a packet
492 	 * from this flow.
493 	 */
494 	if (qlen != slot->qlen)
495 		return NET_XMIT_CN;
496 
497 	/* As we dropped a packet, better let the upper stack know about it */
498 	qdisc_tree_decrease_qlen(sch, 1);
499 	return NET_XMIT_SUCCESS;
500 }
501 
502 static struct sk_buff *
503 sfq_dequeue(struct Qdisc *sch)
504 {
505 	struct sfq_sched_data *q = qdisc_priv(sch);
506 	struct sk_buff *skb;
507 	sfq_index a, next_a;
508 	struct sfq_slot *slot;
509 
510 	/* No active slots */
511 	if (q->tail == NULL)
512 		return NULL;
513 
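	/*
	 * Deficit round robin over the active slots: a slot whose allot
	 * (credit) is exhausted gets a fresh scaled quantum and the round
	 * moves on to the next slot; otherwise we dequeue from it and charge
	 * the packet length against its allot below.
	 */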
514 next_slot:
515 	a = q->tail->next;
516 	slot = &q->slots[a];
517 	if (slot->allot <= 0) {
518 		q->tail = slot;
519 		slot->allot += q->scaled_quantum;
520 		goto next_slot;
521 	}
522 	skb = slot_dequeue_head(slot);
523 	sfq_dec(q, a);
524 	qdisc_bstats_update(sch, skb);
525 	sch->q.qlen--;
526 	sch->qstats.backlog -= qdisc_pkt_len(skb);
527 	slot->backlog -= qdisc_pkt_len(skb);
528 	/* Is the slot empty? */
529 	if (slot->qlen == 0) {
530 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
531 		next_a = slot->next;
532 		if (a == next_a) {
533 			q->tail = NULL; /* no more active slots */
534 			return skb;
535 		}
536 		q->tail->next = next_a;
537 	} else {
538 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
539 	}
540 	return skb;
541 }
542 
543 static void
544 sfq_reset(struct Qdisc *sch)
545 {
546 	struct sk_buff *skb;
547 
548 	while ((skb = sfq_dequeue(sch)) != NULL)
549 		kfree_skb(skb);
550 }
551 
552 /*
553  * When q->perturbation is changed, we rehash all queued skbs
554  * to avoid OOO (Out Of Order) effects.
555  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
556  * counters.
557  */
558 static void sfq_rehash(struct Qdisc *sch)
559 {
560 	struct sfq_sched_data *q = qdisc_priv(sch);
561 	struct sk_buff *skb;
562 	int i;
563 	struct sfq_slot *slot;
564 	struct sk_buff_head list;
565 	int dropped = 0;
566 
567 	__skb_queue_head_init(&list);
568 
569 	for (i = 0; i < q->maxflows; i++) {
570 		slot = &q->slots[i];
571 		if (!slot->qlen)
572 			continue;
573 		while (slot->qlen) {
574 			skb = slot_dequeue_head(slot);
575 			sfq_dec(q, i);
576 			__skb_queue_tail(&list, skb);
577 		}
578 		slot->backlog = 0;
579 		red_set_vars(&slot->vars);
580 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
581 	}
582 	q->tail = NULL;
583 
584 	while ((skb = __skb_dequeue(&list)) != NULL) {
585 		unsigned int hash = sfq_hash(q, skb);
586 		sfq_index x = q->ht[hash];
587 
588 		slot = &q->slots[x];
589 		if (x == SFQ_EMPTY_SLOT) {
590 			x = q->dep[0].next; /* get a free slot */
591 			if (x >= SFQ_MAX_FLOWS) {
592 drop:				sch->qstats.backlog -= qdisc_pkt_len(skb);
593 				kfree_skb(skb);
594 				dropped++;
595 				continue;
596 			}
597 			q->ht[hash] = x;
598 			slot = &q->slots[x];
599 			slot->hash = hash;
600 		}
601 		if (slot->qlen >= q->maxdepth)
602 			goto drop;
603 		slot_queue_add(slot, skb);
604 		if (q->red_parms)
605 			slot->vars.qavg = red_calc_qavg(q->red_parms,
606 							&slot->vars,
607 							slot->backlog);
608 		slot->backlog += qdisc_pkt_len(skb);
609 		sfq_inc(q, x);
610 		if (slot->qlen == 1) {		/* The flow is new */
611 			if (q->tail == NULL) {	/* It is the first flow */
612 				slot->next = x;
613 			} else {
614 				slot->next = q->tail->next;
615 				q->tail->next = x;
616 			}
617 			q->tail = slot;
618 			slot->allot = q->scaled_quantum;
619 		}
620 	}
621 	sch->q.qlen -= dropped;
622 	qdisc_tree_decrease_qlen(sch, dropped);
623 }
624 
625 static void sfq_perturbation(unsigned long arg)
626 {
627 	struct Qdisc *sch = (struct Qdisc *)arg;
628 	struct sfq_sched_data *q = qdisc_priv(sch);
629 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
630 
631 	spin_lock(root_lock);
632 	q->perturbation = prandom_u32();
633 	if (!q->filter_list && q->tail)
634 		sfq_rehash(sch);
635 	spin_unlock(root_lock);
636 
637 	if (q->perturb_period)
638 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
639 }
640 
641 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
642 {
643 	struct sfq_sched_data *q = qdisc_priv(sch);
644 	struct tc_sfq_qopt *ctl = nla_data(opt);
645 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
646 	unsigned int qlen;
647 	struct red_parms *p = NULL;
648 
649 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
650 		return -EINVAL;
651 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
652 		ctl_v1 = nla_data(opt);
653 	if (ctl->divisor &&
654 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
655 		return -EINVAL;
656 	if (ctl_v1 && ctl_v1->qth_min) {
657 		p = kmalloc(sizeof(*p), GFP_KERNEL);
658 		if (!p)
659 			return -ENOMEM;
660 	}
661 	sch_tree_lock(sch);
662 	if (ctl->quantum) {
663 		q->quantum = ctl->quantum;
664 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
665 	}
666 	q->perturb_period = ctl->perturb_period * HZ;
667 	if (ctl->flows)
668 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
669 	if (ctl->divisor) {
670 		q->divisor = ctl->divisor;
671 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
672 	}
673 	if (ctl_v1) {
674 		if (ctl_v1->depth)
675 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
676 		if (p) {
677 			swap(q->red_parms, p);
678 			red_set_parms(q->red_parms,
679 				      ctl_v1->qth_min, ctl_v1->qth_max,
680 				      ctl_v1->Wlog,
681 				      ctl_v1->Plog, ctl_v1->Scell_log,
682 				      NULL,
683 				      ctl_v1->max_P);
684 		}
685 		q->flags = ctl_v1->flags;
686 		q->headdrop = ctl_v1->headdrop;
687 	}
688 	if (ctl->limit) {
689 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
690 		q->maxflows = min_t(u32, q->maxflows, q->limit);
691 	}
692 
693 	qlen = sch->q.qlen;
694 	while (sch->q.qlen > q->limit)
695 		sfq_drop(sch);
696 	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
697 
698 	del_timer(&q->perturb_timer);
699 	if (q->perturb_period) {
700 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
701 		q->perturbation = prandom_u32();
702 	}
703 	sch_tree_unlock(sch);
704 	kfree(p);
705 	return 0;
706 }
707 
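/* The hash and slot tables can be large (up to 64K entries), so fall back
 * to vmalloc() when a physically contiguous kmalloc() fails.
 */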
708 static void *sfq_alloc(size_t sz)
709 {
710 	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
711 
712 	if (!ptr)
713 		ptr = vmalloc(sz);
714 	return ptr;
715 }
716 
717 static void sfq_free(void *addr)
718 {
719 	kvfree(addr);
720 }
721 
722 static void sfq_destroy(struct Qdisc *sch)
723 {
724 	struct sfq_sched_data *q = qdisc_priv(sch);
725 
726 	tcf_destroy_chain(&q->filter_list);
727 	q->perturb_period = 0;
728 	del_timer_sync(&q->perturb_timer);
729 	sfq_free(q->ht);
730 	sfq_free(q->slots);
731 	kfree(q->red_parms);
732 }
733 
734 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
735 {
736 	struct sfq_sched_data *q = qdisc_priv(sch);
737 	int i;
738 
739 	q->perturb_timer.function = sfq_perturbation;
740 	q->perturb_timer.data = (unsigned long)sch;
741 	init_timer_deferrable(&q->perturb_timer);
742 
743 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
744 		q->dep[i].next = i + SFQ_MAX_FLOWS;
745 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
746 	}
747 
748 	q->limit = SFQ_MAX_DEPTH;
749 	q->maxdepth = SFQ_MAX_DEPTH;
750 	q->cur_depth = 0;
751 	q->tail = NULL;
752 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
753 	q->maxflows = SFQ_DEFAULT_FLOWS;
754 	q->quantum = psched_mtu(qdisc_dev(sch));
755 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
756 	q->perturb_period = 0;
757 	q->perturbation = prandom_u32();
758 
759 	if (opt) {
760 		int err = sfq_change(sch, opt);
761 		if (err)
762 			return err;
763 	}
764 
765 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
766 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
767 	if (!q->ht || !q->slots) {
768 		sfq_destroy(sch);
769 		return -ENOMEM;
770 	}
771 	for (i = 0; i < q->divisor; i++)
772 		q->ht[i] = SFQ_EMPTY_SLOT;
773 
774 	for (i = 0; i < q->maxflows; i++) {
775 		slot_queue_init(&q->slots[i]);
776 		sfq_link(q, i);
777 	}
778 	if (q->limit >= 1)
779 		sch->flags |= TCQ_F_CAN_BYPASS;
780 	else
781 		sch->flags &= ~TCQ_F_CAN_BYPASS;
782 	return 0;
783 }
784 
785 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
786 {
787 	struct sfq_sched_data *q = qdisc_priv(sch);
788 	unsigned char *b = skb_tail_pointer(skb);
789 	struct tc_sfq_qopt_v1 opt;
790 	struct red_parms *p = q->red_parms;
791 
792 	memset(&opt, 0, sizeof(opt));
793 	opt.v0.quantum	= q->quantum;
794 	opt.v0.perturb_period = q->perturb_period / HZ;
795 	opt.v0.limit	= q->limit;
796 	opt.v0.divisor	= q->divisor;
797 	opt.v0.flows	= q->maxflows;
798 	opt.depth	= q->maxdepth;
799 	opt.headdrop	= q->headdrop;
800 
801 	if (p) {
802 		opt.qth_min	= p->qth_min >> p->Wlog;
803 		opt.qth_max	= p->qth_max >> p->Wlog;
804 		opt.Wlog	= p->Wlog;
805 		opt.Plog	= p->Plog;
806 		opt.Scell_log	= p->Scell_log;
807 		opt.max_P	= p->max_P;
808 	}
809 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
810 	opt.flags	= q->flags;
811 
812 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
813 		goto nla_put_failure;
814 
815 	return skb->len;
816 
817 nla_put_failure:
818 	nlmsg_trim(skb, b);
819 	return -1;
820 }
821 
822 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
823 {
824 	return NULL;
825 }
826 
827 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
828 {
829 	return 0;
830 }
831 
832 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
833 			      u32 classid)
834 {
835 	/* we cannot bypass queue discipline anymore */
836 	sch->flags &= ~TCQ_F_CAN_BYPASS;
837 	return 0;
838 }
839 
840 static void sfq_put(struct Qdisc *q, unsigned long cl)
841 {
842 }
843 
844 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
845 {
846 	struct sfq_sched_data *q = qdisc_priv(sch);
847 
848 	if (cl)
849 		return NULL;
850 	return &q->filter_list;
851 }
852 
853 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
854 			  struct sk_buff *skb, struct tcmsg *tcm)
855 {
856 	tcm->tcm_handle |= TC_H_MIN(cl);
857 	return 0;
858 }
859 
860 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
861 				struct gnet_dump *d)
862 {
863 	struct sfq_sched_data *q = qdisc_priv(sch);
864 	sfq_index idx = q->ht[cl - 1];
865 	struct gnet_stats_queue qs = { 0 };
866 	struct tc_sfq_xstats xstats = { 0 };
867 
868 	if (idx != SFQ_EMPTY_SLOT) {
869 		const struct sfq_slot *slot = &q->slots[idx];
870 
871 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
872 		qs.qlen = slot->qlen;
873 		qs.backlog = slot->backlog;
874 	}
875 	if (gnet_stats_copy_queue(d, &qs) < 0)
876 		return -1;
877 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
878 }
879 
880 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
881 {
882 	struct sfq_sched_data *q = qdisc_priv(sch);
883 	unsigned int i;
884 
885 	if (arg->stop)
886 		return;
887 
888 	for (i = 0; i < q->divisor; i++) {
889 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
890 		    arg->count < arg->skip) {
891 			arg->count++;
892 			continue;
893 		}
894 		if (arg->fn(sch, i + 1, arg) < 0) {
895 			arg->stop = 1;
896 			break;
897 		}
898 		arg->count++;
899 	}
900 }
901 
902 static const struct Qdisc_class_ops sfq_class_ops = {
903 	.leaf		=	sfq_leaf,
904 	.get		=	sfq_get,
905 	.put		=	sfq_put,
906 	.tcf_chain	=	sfq_find_tcf,
907 	.bind_tcf	=	sfq_bind,
908 	.unbind_tcf	=	sfq_put,
909 	.dump		=	sfq_dump_class,
910 	.dump_stats	=	sfq_dump_class_stats,
911 	.walk		=	sfq_walk,
912 };
913 
914 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
915 	.cl_ops		=	&sfq_class_ops,
916 	.id		=	"sfq",
917 	.priv_size	=	sizeof(struct sfq_sched_data),
918 	.enqueue	=	sfq_enqueue,
919 	.dequeue	=	sfq_dequeue,
920 	.peek		=	qdisc_peek_dequeued,
921 	.drop		=	sfq_drop,
922 	.init		=	sfq_init,
923 	.reset		=	sfq_reset,
924 	.destroy	=	sfq_destroy,
925 	.change		=	NULL,
926 	.dump		=	sfq_dump,
927 	.owner		=	THIS_MODULE,
928 };
929 
930 static int __init sfq_module_init(void)
931 {
932 	return register_qdisc(&sfq_qdisc_ops);
933 }
934 static void __exit sfq_module_exit(void)
935 {
936 	unregister_qdisc(&sfq_qdisc_ops);
937 }
938 module_init(sfq_module_init)
939 module_exit(sfq_module_exit)
940 MODULE_LICENSE("GPL");
941